diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -93,7 +93,7 @@ MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg()) { Register Reg = Src0.getReg(); - if (Reg.isVirtual() && MRI->hasOneUse(Reg)) { + if (Reg.isVirtual()) { MachineInstr *Def = MRI->getUniqueVRegDef(Reg); if (Def && Def->isMoveImmediate()) { MachineOperand &MovSrc = Def->getOperand(1); @@ -115,8 +115,8 @@ } if (ConstantFolded) { - assert(MRI->use_empty(Reg)); - Def->eraseFromParent(); + if (MRI->use_nodbg_empty(Reg)) + Def->eraseFromParent(); ++NumLiteralConstantsFolded; return true; } @@ -739,11 +739,7 @@ } } - // FIXME: We also need to consider movs of constant operands since - // immediate operands are not folded if they have more than one use, and - // the operand folding pass is unaware if the immediate will be free since - // it won't know if the src == dest constraint will end up being - // satisfied. + // Try to use S_ADDK_I32 and S_MULK_I32. if (MI.getOpcode() == AMDGPU::S_ADD_I32 || MI.getOpcode() == AMDGPU::S_MUL_I32) { const MachineOperand *Dest = &MI.getOperand(0); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -94,9 +94,8 @@ ; GFX8-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v1 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll @@ -80,11 +80,10 @@ ; GFX7-LABEL: v_uaddo_i8: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -93,11 +92,10 @@ ; GFX8-LABEL: v_uaddo_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 @@ -123,11 +121,10 @@ ; GFX7-LABEL: v_uaddo_i7: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0x7f -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -136,11 +133,10 @@ ; GFX8-LABEL: v_uaddo_i7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 @@ -149,11 +145,10 @@ ; GFX9-LABEL: v_uaddo_i7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 @@ -576,11 +571,10 @@ ; GFX7-LABEL: s_uaddo_i8: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -589,11 +583,10 @@ ; GFX8-LABEL: s_uaddo_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 @@ -619,11 +612,10 @@ ; GFX7-LABEL: s_uaddo_i7: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0x7f -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -632,11 +624,10 @@ ; GFX8-LABEL: s_uaddo_i7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 @@ -645,11 +636,10 @@ ; GFX9-LABEL: s_uaddo_i7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 @@ -966,11 +956,10 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) { ; GFX7-LABEL: uaddo_i16_sv: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -979,11 +968,10 @@ ; ; GFX8-LABEL: uaddo_i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s1, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -559,12 +559,11 @@ ; GFX6-LABEL: v_andn2_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v1 @@ -816,18 +815,17 @@ ; GFX6-LABEL: v_andn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v8 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v8 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_and_b32_e32 v4, v6, v8 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -722,11 +722,10 @@ ; GFX6-LABEL: v_ashr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -845,15 +844,14 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: ashr_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: v_ashr_i32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: v_ashr_i32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -889,10 +887,9 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -935,25 +932,24 @@ ; GFX6-LABEL: v_ashr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1098,46 +1094,44 @@ ; GFX6-LABEL: v_ashr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v9 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v9 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v11 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, v2, v16 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, v8, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v5, v16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v4, v16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, v7, v16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, v6, v16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -66,9 +66,8 @@ ; GFX9-CONTRACT-LABEL: test_f16_sub_ext_neg_mul: ; GFX9-CONTRACT: ; %bb.0: ; %entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: s_mov_b32 s4, 0x8000 -; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -892,18 +892,18 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_lshr_b32 s1, s2, 2 -; GFX7-NEXT: s_and_b32 s2, s2, 3 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: s_and_b32 s1, s2, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_lshl_b32 s0, s1, 3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 @@ -918,7 +918,6 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: s_lshl_b32 s0, s2, 3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog @@ -1019,7 +1018,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 @@ -1028,9 +1026,9 @@ ; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 @@ -2291,58 +2289,56 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v4, 0xff -; GFX7-NEXT: s_lshr_b32 s1, s2, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_and_b32 s2, s2, 3 +; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v9, s0, v0 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 +; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v15 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_or_b32_e32 v11, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7-NEXT: s_lshl_b32 s0, s2, 3 +; GFX7-NEXT: s_lshl_b32 s0, s1, 3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog @@ -2500,57 +2496,55 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff -; GFX7-NEXT: v_mov_b32_e32 v0, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v3 +; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v14, v5, v0 +; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 ; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v6 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v16 -; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 +; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -866,8 +866,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 @@ -881,7 +880,7 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 @@ -1001,8 +1000,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 @@ -1016,7 +1014,7 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 @@ -1182,8 +1180,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 @@ -1197,7 +1194,7 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -75,15 +75,14 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 4 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -137,14 +136,13 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s32 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, s32, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s32, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -305,14 +303,13 @@ ; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x104 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x104, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -374,16 +371,16 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v1, off, s32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, vcc_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, vcc_hi, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -525,14 +522,13 @@ ; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4004 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x4004, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -596,16 +592,16 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, vcc_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, vcc_hi, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -255,7 +255,7 @@ ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s2, 0x80000000 -; SI-NEXT: v_sub_f32_e32 v2, s2, v2 +; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2 ; SI-NEXT: v_sub_f32_e64 v4, s2, |v4| ; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] @@ -290,7 +290,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_sub_f32_e32 v4, s2, v7 +; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7 ; VI-NEXT: v_sub_f32_e64 v3, s2, |v3| ; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -308,7 +308,7 @@ ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s2, 0x80000000 -; GFX9-NEXT: v_sub_f32_e32 v1, s2, v1 +; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3| ; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -186,9 +186,8 @@ ; GFX8-LABEL: v_fmul_v4f16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 @@ -223,9 +222,8 @@ ; GFX8-LABEL: v_fmul_v4f16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 @@ -336,10 +334,9 @@ ; GFX8-LABEL: v_fmul_v6f16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 @@ -381,10 +378,9 @@ ; GFX8-LABEL: v_fmul_v6f16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 -; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, 0x80008000, v5 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 @@ -515,11 +511,10 @@ ; GFX8-LABEL: v_fmul_v8f16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 @@ -567,11 +562,10 @@ ; GFX8-LABEL: v_fmul_v8f16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 -; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, 0x80008000, v5 +; GFX8-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 +; GFX8-NEXT: v_xor_b32_e32 v7, 0x80008000, v7 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -398,19 +398,17 @@ ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_mov_b32 s4, 0x80008000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 @@ -427,11 +425,10 @@ ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX8-NEXT: v_log_f16_e32 v2, v0 ; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -450,15 +447,14 @@ ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0x80008000 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX9-NEXT: v_log_f16_e32 v2, v0 ; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -10,7 +10,6 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s2, s2, 0x7f -; GFX6-NEXT: s_movk_i32 s3, 0x7f ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -27,8 +26,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 6, v0 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshr_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -41,10 +40,9 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f ; GFX8-NEXT: s_and_b32 s1, s1, 0x7f -; GFX8-NEXT: s_movk_i32 s3, 0x7f +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 ; GFX8-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -59,8 +57,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u16_e32 v1, 6, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -73,10 +71,9 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f ; GFX9-NEXT: s_and_b32 s1, s1, 0x7f -; GFX9-NEXT: s_movk_i32 s3, 0x7f +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -91,8 +88,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u16_e32 v1, 6, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -147,7 +144,6 @@ ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7f ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 @@ -157,9 +153,9 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -170,15 +166,14 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 @@ -188,9 +183,9 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -201,15 +196,14 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 @@ -219,9 +213,9 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v3 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -658,9 +652,8 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -996,23 +989,22 @@ ; GFX6-NEXT: v_bfe_u32 v4, v1, 16, 8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1059,12 +1051,11 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1114,8 +1105,8 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1189,10 +1180,9 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX6-NEXT: s_mov_b32 s3, 0xffffff +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -1206,8 +1196,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshr_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1220,10 +1210,9 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX8-NEXT: s_mov_b32 s3, 0xffffff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -1237,8 +1226,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1251,10 +1240,9 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX9-NEXT: s_mov_b32 s3, 0xffffff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -1268,8 +1256,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s1 ; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -1321,7 +1309,6 @@ ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 @@ -1331,9 +1318,9 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1352,7 +1339,6 @@ ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 @@ -1362,9 +1348,9 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1383,7 +1369,6 @@ ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 @@ -1393,8 +1378,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1438,11 +1423,11 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 -; GFX6-NEXT: s_and_b32 s10, s0, 0xff +; GFX6-NEXT: s_and_b32 s9, s0, 0xff ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: s_or_b32 s0, s10, s0 +; GFX6-NEXT: s_or_b32 s0, s9, s0 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 @@ -1461,12 +1446,12 @@ ; GFX6-NEXT: s_or_b32 s1, s1, s6 ; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_lshr_b32 s7, s2, 24 -; GFX6-NEXT: s_and_b32 s10, s2, 0xff +; GFX6-NEXT: s_and_b32 s9, s2, 0xff ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: s_or_b32 s2, s10, s2 +; GFX6-NEXT: s_or_b32 s2, s9, s2 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_lshr_b32 s8, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 @@ -1483,13 +1468,13 @@ ; GFX6-NEXT: s_or_b32 s3, s3, s6 ; GFX6-NEXT: s_lshr_b32 s6, s4, 16 ; GFX6-NEXT: s_lshr_b32 s7, s4, 24 -; GFX6-NEXT: s_and_b32 s10, s4, 0xff +; GFX6-NEXT: s_and_b32 s9, s4, 0xff ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX6-NEXT: s_or_b32 s4, s10, s4 +; GFX6-NEXT: s_or_b32 s4, s9, s4 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 @@ -1519,13 +1504,12 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: s_mov_b32 s6, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_lshr_b32 s0, s2, 1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1535,23 +1519,21 @@ ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: s_lshr_b32 s0, s3, 1 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: s_movk_i32 s9, 0xff +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s9, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s9, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_bfe_u32 v2, v1, 8, 8 @@ -1653,13 +1635,12 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX8-NEXT: s_mov_b32 s6, 0xffffff ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX8-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1669,11 +1650,10 @@ ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s1 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -1784,32 +1764,30 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: s_mov_b32 s7, 0xffffff -; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0 +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: v_and_b32_e32 v3, s7, v3 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 -; GFX9-NEXT: v_lshrrev_b32_e64 v3, v3, s2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffff -; GFX9-NEXT: v_sub_u32_e32 v3, 23, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 ; GFX9-NEXT: s_lshr_b32 s0, s3, 1 -; GFX9-NEXT: v_and_b32_e32 v2, v3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: s_mov_b32 s6, 8 ; GFX9-NEXT: v_lshl_or_b32 v1, s1, v1, v2 -; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_mov_b32 s8, 16 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -1961,18 +1939,17 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, 0xffffffe8 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v9, v9 ; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX6-NEXT: v_mul_lo_u32 v8, v7, v6 ; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v9 ; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX6-NEXT: v_and_b32_e32 v5, v5, v9 -; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v9 ; GFX6-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX6-NEXT: v_mul_lo_u32 v7, v7, v8 @@ -1987,9 +1964,9 @@ ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, v6, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v7, 24 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2001,10 +1978,10 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 23 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v9 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2016,18 +1993,17 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX8-NEXT: v_mov_b32_e32 v7, 0xffffffe8 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v9, v9 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX8-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX8-NEXT: v_mul_lo_u32 v8, v7, v6 ; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v8, v9 ; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX8-NEXT: v_and_b32_e32 v5, v5, v9 -; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v9 ; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8 @@ -2042,9 +2018,9 @@ ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 ; GFX8-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 23, v4 -; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v4, v6, v9 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v6 ; GFX8-NEXT: v_mul_lo_u32 v6, v7, 24 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2056,10 +2032,10 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_bfe_u32 v2, v3, 1, 23 -; GFX8-NEXT: v_and_b32_e32 v3, v4, v9 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2078,15 +2054,15 @@ ; GFX9-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 ; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GFX9-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX9-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX9-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v9 ; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX9-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX9-NEXT: v_mul_hi_u32 v7, v9, v7 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 ; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xffffff -; GFX9-NEXT: v_and_b32_e32 v5, v5, v8 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 ; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 @@ -2095,11 +2071,10 @@ ; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 -; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v5, v7 ; GFX9-NEXT: v_sub_u32_e32 v7, 23, v4 -; GFX9-NEXT: v_and_b32_e32 v7, v7, v8 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v8 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v7, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v4, v2 @@ -2111,8 +2086,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v8 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3308,10 +3283,9 @@ ; GFX9-LABEL: v_fshl_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v2 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 @@ -3445,15 +3419,14 @@ ; ; GFX9-LABEL: v_fshl_v2i16_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v1, s2, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xf000f, v0 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s1, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: s_lshr_b32 s1, s1, 0x10001 ; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xf000f, v0 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 @@ -3939,17 +3912,16 @@ ; GFX9-LABEL: v_fshl_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v6, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xf000f, v4 ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v6, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v5 ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 -; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 @@ -4703,10 +4675,9 @@ ; GFX6-LABEL: v_fshl_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0x7f -; GFX6-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX6-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v14 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v14 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 @@ -4752,10 +4723,9 @@ ; GFX8-LABEL: v_fshl_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX8-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v14 ; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v14 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] @@ -4801,10 +4771,9 @@ ; GFX9-LABEL: v_fshl_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX9-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v14 ; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v14 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] @@ -4902,10 +4871,9 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s8, 0x7f -; GFX6-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v6 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v6 @@ -4955,10 +4923,9 @@ ; ; GFX8-LABEL: v_fshl_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s8, 0x7f -; GFX8-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v6 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] @@ -5008,10 +4975,9 @@ ; ; GFX9-LABEL: v_fshl_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s8, 0x7f -; GFX9-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v6 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] @@ -6022,8 +5988,7 @@ ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s6, 0x7f -; GFX6-NEXT: v_and_b32_e32 v23, s6, v16 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 @@ -6031,7 +5996,7 @@ ; GFX6-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10 -; GFX6-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v17 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 @@ -6066,9 +6031,9 @@ ; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v17, v8 ; GFX6-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX6-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v20 -; GFX6-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v16 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 @@ -6114,8 +6079,7 @@ ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s6, 0x7f -; GFX8-NEXT: v_and_b32_e32 v23, s6, v16 +; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] @@ -6123,7 +6087,7 @@ ; GFX8-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10 -; GFX8-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX8-NEXT: v_or_b32_e32 v9, v9, v17 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 @@ -6158,9 +6122,9 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v17, v8 ; GFX8-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX8-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX8-NEXT: v_xor_b32_e32 v8, -1, v20 -; GFX8-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v16 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] @@ -6206,8 +6170,7 @@ ; GFX9-LABEL: v_fshl_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s6, 0x7f -; GFX9-NEXT: v_and_b32_e32 v23, s6, v16 +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] @@ -6215,7 +6178,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 31, v10 -; GFX9-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX9-NEXT: v_or_b32_e32 v9, v9, v17 ; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 @@ -6250,9 +6213,9 @@ ; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 ; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX9-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v20 -; GFX9-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -10,11 +10,10 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s2, s2, 0x7f -; GFX6-NEXT: s_movk_i32 s3, 0x7f ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s1, s1, 0x7f ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s1, s1, 0x7f ; GFX6-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -28,8 +27,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 6, v0 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshr_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 @@ -41,11 +40,10 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f -; GFX8-NEXT: s_movk_i32 s3, 0x7f ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -59,8 +57,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u16_e32 v1, 6, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -72,11 +70,10 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f -; GFX9-NEXT: s_movk_i32 s3, 0x7f ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -90,8 +87,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u16_e32 v1, 6, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0 ; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s1 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 @@ -139,14 +136,13 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 @@ -156,8 +152,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -170,14 +166,13 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 @@ -187,8 +182,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -201,14 +196,13 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 @@ -218,8 +212,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -646,10 +640,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 @@ -660,7 +653,7 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -980,43 +973,42 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_and_b32_e32 v11, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, v10, v11 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX6-NEXT: v_and_b32_e32 v10, 7, v7 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v10, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v7 ; GFX6-NEXT: v_xor_b32_e32 v7, -1, v7 ; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, v10, v7 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX6-NEXT: v_and_b32_e32 v7, 7, v8 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v7 +; GFX6-NEXT: v_xor_b32_e32 v7, -1, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX6-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 7, v8 +; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v7, v1 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 7, v9 -; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v9 +; GFX6-NEXT: v_and_b32_e32 v3, 7, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, v7, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v6 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v3, v6 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, v4, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1061,12 +1053,11 @@ ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1115,8 +1106,8 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1190,10 +1181,9 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX6-NEXT: s_mov_b32 s3, 0xffffff +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -1208,8 +1198,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshr_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1222,10 +1212,9 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX8-NEXT: s_mov_b32 s3, 0xffffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -1240,8 +1229,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1254,10 +1243,9 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX9-NEXT: s_mov_b32 s3, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -1272,8 +1260,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s1 ; GFX9-NEXT: v_lshl_or_b32 v0, s0, v1, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -1322,12 +1310,11 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 @@ -1337,8 +1324,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1354,12 +1341,11 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 @@ -1369,8 +1355,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1383,15 +1369,14 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffffe8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 @@ -1401,8 +1386,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1450,46 +1435,46 @@ ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s10, s0, 0xff +; GFX6-NEXT: s_and_b32 s9, s0, 0xff ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX6-NEXT: s_or_b32 s0, s10, s0 +; GFX6-NEXT: s_or_b32 s0, s9, s0 ; GFX6-NEXT: s_or_b32 s1, s7, s1 ; GFX6-NEXT: s_and_b32 s7, s8, 0xff ; GFX6-NEXT: s_lshr_b32 s8, s2, 16 -; GFX6-NEXT: s_lshr_b32 s10, s2, 24 -; GFX6-NEXT: s_and_b32 s12, s2, 0xff +; GFX6-NEXT: s_lshr_b32 s9, s2, 24 +; GFX6-NEXT: s_and_b32 s11, s2, 0xff ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s8, s8, 0xff -; GFX6-NEXT: s_or_b32 s2, s12, s2 +; GFX6-NEXT: s_or_b32 s2, s11, s2 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshr_b32 s11, s3, 8 +; GFX6-NEXT: s_lshr_b32 s10, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 -; GFX6-NEXT: s_and_b32 s8, s11, 0xff +; GFX6-NEXT: s_and_b32 s8, s10, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: s_or_b32 s3, s10, s3 +; GFX6-NEXT: s_or_b32 s3, s9, s3 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s8 ; GFX6-NEXT: s_lshr_b32 s8, s4, 16 -; GFX6-NEXT: s_lshr_b32 s10, s4, 24 -; GFX6-NEXT: s_and_b32 s12, s4, 0xff +; GFX6-NEXT: s_lshr_b32 s9, s4, 24 +; GFX6-NEXT: s_and_b32 s11, s4, 0xff ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 ; GFX6-NEXT: s_and_b32 s8, s8, 0xff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX6-NEXT: s_or_b32 s4, s12, s4 +; GFX6-NEXT: s_or_b32 s4, s11, s4 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 @@ -1498,7 +1483,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_lshr_b32 s11, s5, 8 +; GFX6-NEXT: s_lshr_b32 s10, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: s_and_b32 s5, s5, 0xff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 @@ -1507,9 +1492,9 @@ ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX6-NEXT: s_and_b32 s8, s11, 0xff +; GFX6-NEXT: s_and_b32 s8, s10, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: s_or_b32 s5, s10, s5 +; GFX6-NEXT: s_or_b32 s5, s9, s5 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 @@ -1523,13 +1508,12 @@ ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX6-NEXT: s_mov_b32 s8, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 ; GFX6-NEXT: s_lshl_b32 s4, s6, 17 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_or_b32 s0, s4, s0 -; GFX6-NEXT: v_and_b32_e32 v2, s8, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 @@ -1542,25 +1526,23 @@ ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 ; GFX6-NEXT: s_lshl_b32 s0, s7, 17 ; GFX6-NEXT: s_lshl_b32 s1, s1, 1 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX6-NEXT: s_movk_i32 s9, 0xff +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s9, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s9, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_bfe_u32 v2, v1, 8, 8 @@ -1656,14 +1638,13 @@ ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_mov_b32 s8, 0xffffff -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: s_lshl_b32 s4, s6, 17 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_or_b32 s0, s4, s0 -; GFX8-NEXT: v_and_b32_e32 v2, s8, v3 -; GFX8-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s2 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 @@ -1676,13 +1657,12 @@ ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 ; GFX8-NEXT: s_lshl_b32 s0, s7, 17 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s3 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 @@ -1786,39 +1766,37 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX9-NEXT: s_mov_b32 s10, 0xffffff ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0 ; GFX9-NEXT: s_lshl_b32 s4, s7, 17 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: v_and_b32_e32 v0, s10, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX9-NEXT: s_or_b32 s0, s4, s0 -; GFX9-NEXT: v_and_b32_e32 v3, s10, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s2 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, v3, v0 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, v2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffff -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_sub_u32_e32 v3, 23, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 ; GFX9-NEXT: s_lshl_b32 s0, s9, 17 ; GFX9-NEXT: s_lshl_b32 s1, s1, 1 -; GFX9-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s3 ; GFX9-NEXT: s_mov_b32 s6, 8 -; GFX9-NEXT: v_lshl_or_b32 v1, s0, v3, v1 -; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_lshl_or_b32 v1, s0, v2, v1 ; GFX9-NEXT: s_mov_b32 s8, 16 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -1973,19 +1951,19 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 ; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v9 -; GFX6-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX6-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; GFX6-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 @@ -1995,15 +1973,14 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v7, v8 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 -; GFX6-NEXT: v_and_b32_e32 v7, v7, v9 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX6-NEXT: v_mul_hi_u32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v9 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 @@ -2013,8 +1990,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2030,19 +2007,19 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v8, v9 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX8-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 @@ -2052,15 +2029,14 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-NEXT: v_mul_lo_u32 v6, v7, v8 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 -; GFX8-NEXT: v_and_b32_e32 v7, v7, v9 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX8-NEXT: v_mul_hi_u32 v6, v8, v6 -; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v7, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_and_b32_e32 v3, v3, v9 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 @@ -2070,8 +2046,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v2, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2091,31 +2067,30 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_mul_lo_u32 v8, v7, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mul_hi_u32 v7, v9, v7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 ; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xffffff -; GFX9-NEXT: v_and_b32_e32 v5, v5, v8 ; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v8 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 ; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v8 -; GFX9-NEXT: v_and_b32_e32 v6, v6, v8 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v6, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v5, v7 @@ -2126,8 +2101,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v8 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v8 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v2, v3 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3215,10 +3190,9 @@ ; GFX9-LABEL: v_fshr_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v2 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1 @@ -3381,13 +3355,12 @@ ; ; GFX9-LABEL: v_fshr_v2i16_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v1, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: v_and_b32_e32 v1, 0xf000f, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 ; GFX9-NEXT: s_lshl_b32 s2, s2, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xf000f, v0 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s1 @@ -3894,12 +3867,11 @@ ; GFX6-LABEL: v_fshr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX6-NEXT: v_and_b32_e32 v8, v8, v12 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_or_b32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; GFX6-NEXT: v_and_b32_e32 v10, v10, v12 +; GFX6-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_bfe_u32 v10, v4, 1, 15 @@ -4029,17 +4001,16 @@ ; GFX9-LABEL: v_fshr_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v6, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xf000f, v4 ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v4, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v6, v2 ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v5 -; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v4, v1 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v2, v3 @@ -4820,11 +4791,10 @@ ; GFX6-LABEL: v_fshr_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0x7f -; GFX6-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 @@ -4869,11 +4839,10 @@ ; GFX8-LABEL: v_fshr_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 @@ -4918,11 +4887,10 @@ ; GFX9-LABEL: v_fshr_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 @@ -5019,11 +4987,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshr_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s8, 0x7f -; GFX6-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_mov_b32 s9, 0 -; GFX6-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s8, s1, 31 ; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 @@ -5072,11 +5039,10 @@ ; ; GFX8-LABEL: v_fshr_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s8, 0x7f -; GFX8-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_mov_b32 s9, 0 -; GFX8-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s8, s1, 31 ; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 @@ -5125,11 +5091,10 @@ ; ; GFX9-LABEL: v_fshr_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s8, 0x7f -; GFX9-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: s_mov_b32 s9, 0 -; GFX9-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s8, s1, 31 ; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 @@ -6145,17 +6110,16 @@ ; GFX6-LABEL: v_fshr_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s6, 0x7f ; GFX6-NEXT: v_xor_b32_e32 v17, -1, v16 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v23, s6, v17 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v17 ; GFX6-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 -; GFX6-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 ; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 @@ -6191,7 +6155,7 @@ ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX6-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 @@ -6204,7 +6168,7 @@ ; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v17 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v18 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX6-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc @@ -6237,17 +6201,16 @@ ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s6, 0x7f ; GFX8-NEXT: v_xor_b32_e32 v17, -1, v16 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v23, s6, v17 +; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 ; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 @@ -6283,7 +6246,7 @@ ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX8-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 @@ -6296,7 +6259,7 @@ ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v17, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[8:9], v18, v[8:9] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX8-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc @@ -6329,17 +6292,16 @@ ; GFX9-LABEL: v_fshr_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s6, 0x7f ; GFX9-NEXT: v_xor_b32_e32 v17, -1, v16 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v23, s6, v17 +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 ; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 @@ -6375,7 +6337,7 @@ ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX9-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 @@ -6388,7 +6350,7 @@ ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], v18, v[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX9-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll @@ -82,9 +82,8 @@ ; The offset to the dynamic shared memory array should be aligned on the ; maximal one. ; CHECK-LABEL: {{^}}dynamic_shared_array_4: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}} -; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]] +; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x48, [[IDX]] define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() %vidx = add i32 %tid.x, %idx @@ -101,9 +100,8 @@ ; Honor the explicit alignment from the specified variable. ; CHECK-LABEL: {{^}}dynamic_shared_array_5: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}} -; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]] +; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x44, [[IDX]] define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() %vidx = add i32 %tid.x, %idx @@ -120,9 +118,8 @@ ; Honor the explicit alignment from the specified variable. ; CHECK-LABEL: {{^}}dynamic_shared_array_6: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}} -; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]] +; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x50, [[IDX]] define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() %vidx = add i32 %tid.x, %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -332,11 +332,10 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v1 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 @@ -547,12 +546,11 @@ ; GFX7-LABEL: insertelement_v_v2i16_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 @@ -1013,15 +1011,14 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 @@ -1306,13 +1303,12 @@ ; GFX7-LABEL: insertelement_v_v4i16_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 @@ -2001,20 +1997,19 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: s_mov_b32 s8, 0xffff +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 @@ -2380,15 +2375,14 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 @@ -3486,17 +3480,16 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: s_mov_b32 s20, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v9, s18 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s20, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mov_b32_e32 v10, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s20, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 @@ -4068,19 +4061,18 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -51,17 +51,16 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -133,17 +132,16 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -217,15 +215,14 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -301,17 +298,16 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -386,15 +382,14 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xff ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -466,17 +461,16 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -546,15 +540,14 @@ ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -624,15 +617,14 @@ ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -868,10 +860,10 @@ ; GFX9-LABEL: insertelement_s_v4i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_movk_i32 s5, 0xff -; GFX9-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s1, 8 ; GFX9-NEXT: s_mov_b32 s2, 16 +; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 @@ -938,17 +930,16 @@ ; GFX7-LABEL: insertelement_s_v4i8_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s2, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s3, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_and_b32 s2, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s3, s3, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s3, s3, s5 +; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s3, s0 +; GFX7-NEXT: s_or_b32 s0, s2, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s4, 3 @@ -959,7 +950,7 @@ ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1088,28 +1079,27 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_movk_i32 s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s3, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_and_b32 s2, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s3, s3, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s3, s3, s5 +; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s3, s0 +; GFX7-NEXT: s_or_b32 s0, s2, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s4, 0xff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1236,29 +1226,28 @@ ; GFX7-LABEL: insertelement_s_v4i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s2, 0xff ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s3, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s4, s4, 8 +; GFX7-NEXT: s_and_b32 s2, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s3, s3, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s3, s3, s4 +; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s3, s0 +; GFX7-NEXT: s_or_b32 s0, s2, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 24 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1387,37 +1376,36 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: v_and_b32_e32 v1, 3, v2 ; GFX7-NEXT: s_and_b32 s0, s2, 0xff -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v1 -; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -1533,18 +1521,17 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_and_b32 s1, s2, 3 -; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, s1, v1 -; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 +; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -1552,7 +1539,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 @@ -1678,38 +1665,36 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_movk_i32 s2, 0xff -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff -; GFX7-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v1 -; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v5, s2, v0 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2131,64 +2116,62 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_lshr_b32 s1, s3, 2 -; GFX7-NEXT: s_and_b32 s3, s3, 3 +; GFX7-NEXT: s_and_b32 s1, s3, 3 +; GFX7-NEXT: s_lshr_b32 s0, s3, 2 ; GFX7-NEXT: s_and_b32 s2, s2, 0xff -; GFX7-NEXT: s_lshl_b32 s3, s3, 3 -; GFX7-NEXT: s_lshl_b32 s2, s2, s3 -; GFX7-NEXT: s_lshl_b32 s3, 0xff, s3 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_not_b32 s3, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, s0, v0 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v7, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s3, v3 -; GFX7-NEXT: v_or_b32_e32 v3, s2, v3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -2373,7 +2356,6 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 @@ -2403,32 +2385,32 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4 -; GFX7-NEXT: v_or_b32_e32 v3, s3, v0 +; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_bfe_u32 v3, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2631,36 +2613,35 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_and_b32 s6, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_and_b32 s5, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s6, s6, s7 +; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s6, s0 +; GFX7-NEXT: s_or_b32 s0, s5, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s6, s1, 0x80008 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s1, 0xff -; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_and_b32 s2, s4, 0xff ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 @@ -2670,7 +2651,7 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -2887,37 +2868,36 @@ ; GFX7-LABEL: insertelement_s_v8i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_and_b32 s5, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_and_b32 s4, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s5, s5, s6 +; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s5, s0 +; GFX7-NEXT: s_or_b32 s0, s4, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX7-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s1, 0xff -; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_lshl_b32 s4, s4, 8 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX7-NEXT: s_or_b32 s2, s2, s5 +; GFX7-NEXT: s_or_b32 s2, s2, s4 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 @@ -2928,7 +2908,7 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -3131,64 +3111,62 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xff -; GFX7-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v5, s1, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v0 +; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v10, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3360,64 +3338,62 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v3, 0xff -; GFX7-NEXT: s_lshr_b32 s1, s2, 2 -; GFX7-NEXT: s_and_b32 s2, s2, 3 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX7-NEXT: s_lshl_b32 s2, s2, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 -; GFX7-NEXT: s_lshl_b32 s2, 0xff, s2 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_not_b32 s2, s2 +; GFX7-NEXT: s_and_b32 s1, s2, 3 +; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v6, s0, v0 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v7, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3588,62 +3564,60 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v4, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v0 +; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v10, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, v6, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v4, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -4349,109 +4323,107 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v4, 0xff -; GFX7-NEXT: s_and_b32 s1, s3, 3 +; GFX7-NEXT: s_and_b32 s0, s3, 3 ; GFX7-NEXT: s_lshr_b32 s4, s3, 2 -; GFX7-NEXT: s_and_b32 s2, s2, 0xff -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: s_lshl_b32 s5, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s6, s1 +; GFX7-NEXT: s_not_b32 s6, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v9, s0, v0 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 +; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v16, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v15, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v3, v12, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v8 -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, s5, v5 +; GFX7-NEXT: v_or_b32_e32 v11, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v11, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v1 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -4757,7 +4729,6 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s0, 24 @@ -4811,58 +4782,58 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX7-NEXT: s_andn2_b32 s4, s6, s4 -; GFX7-NEXT: v_or_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_and_b32_e32 v5, v2, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -5256,56 +5227,55 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, 0xff -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v1, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v2, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -5638,26 +5608,26 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1 -; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s4, s0, 24 -; GFX7-NEXT: s_and_b32 s9, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_and_b32 s8, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s9, s9, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s9, s9, s10 +; GFX7-NEXT: s_or_b32 s8, s8, s9 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s9, s0 +; GFX7-NEXT: s_or_b32 s0, s8, s0 ; GFX7-NEXT: s_lshl_b32 s4, s4, 24 -; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 +; GFX7-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s1, 24 ; GFX7-NEXT: s_or_b32 s4, s0, s4 ; GFX7-NEXT: s_and_b32 s0, s1, 0xff -; GFX7-NEXT: s_lshl_b32 s9, s9, 8 +; GFX7-NEXT: s_lshl_b32 s8, s8, 8 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX7-NEXT: s_or_b32 s0, s0, s9 +; GFX7-NEXT: s_or_b32 s0, s0, s8 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s5, 24 @@ -5687,66 +5657,64 @@ ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v6, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, 0xff -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v1, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v2, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -6038,109 +6006,107 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v7, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX7-NEXT: v_lshl_b32_e32 v18, s0, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v17 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v17 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 +; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v3 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v12, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v14, v5, v7 +; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v17, v6, 8, 8 +; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v16, v6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v6 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 -; GFX7-NEXT: s_and_b32 s0, s2, 0xff -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 +; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshl_b32_e32 v19, s0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v18 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v18 -; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v19 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v18 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v7 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v11, v1, v7 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v13, v3, v7 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -6407,109 +6373,107 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v7, 0xff +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: s_lshr_b32 s4, s2, 2 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v7 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v3 +; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v4 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v12, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v14, v5, v7 +; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v17, v6, 8, 8 -; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v16, v6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v6 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: s_not_b32 s5, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 +; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 +; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc +; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v5, s5, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v7 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v11, v1, v7 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v2, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v13, v3, v7 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -6775,103 +6739,101 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v8, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 2, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v18 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v8 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v19 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v18 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 -; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v4, 8, 8 +; GFX7-NEXT: v_bfe_u32 v13, v5, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v11, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v13, s0, v5 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v15, v6, v8 +; GFX7-NEXT: v_bfe_u32 v15, v6, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v6 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 -; GFX7-NEXT: v_bfe_u32 v18, v7, 8, 8 +; GFX7-NEXT: v_bfe_u32 v17, v7, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX7-NEXT: v_or_b32_e32 v12, v13, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v7 -; GFX7-NEXT: v_and_b32_e32 v17, v7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v7 +; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v7 ; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX7-NEXT: v_or_b32_e32 v13, v15, v16 -; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v14, v17, v18 -; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX7-NEXT: v_or_b32_e32 v7, v14, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v7, v13, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v10 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, v2, s[0:1] +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v5, v2, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v8 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v11, v1, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v13, v3, v8 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -508,10 +508,9 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm d16 -; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0xffff ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, s0, v1 -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, s0, v2 +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: ; return to shader part epilog @@ -589,10 +588,9 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm d16 -; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0xffff ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -46,14 +46,14 @@ ; GFX906-NEXT: s_movk_i32 s4, 0xff ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX906-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX906-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX906-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX906-NEXT: v_and_b32_e32 v2, s4, v6 -; GFX906-NEXT: v_and_b32_e32 v3, s4, v7 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v7 ; GFX906-NEXT: v_and_or_b32 v1, v4, s4, v1 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -46,14 +46,14 @@ ; GFX906-NEXT: s_movk_i32 s4, 0xff ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX906-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX906-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX906-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX906-NEXT: v_and_b32_e32 v2, s4, v6 -; GFX906-NEXT: v_and_b32_e32 v3, s4, v7 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v7 ; GFX906-NEXT: v_and_or_b32 v1, v4, s4, v1 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -8,9 +8,8 @@ ; GFX6-LABEL: v_lshr_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -105,9 +104,8 @@ ; GCN-LABEL: v_lshr_i24: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0xffffff -; GCN-NEXT: v_and_b32_e32 v1, s4, v1 -; GCN-NEXT: v_and_b32_e32 v0, s4, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -554,9 +552,8 @@ ; GFX6-LABEL: v_lshr_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -685,12 +682,11 @@ ; GFX6-LABEL: v_lshr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -801,11 +797,10 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: lshr_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -838,12 +833,11 @@ define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: lshr_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -887,18 +881,17 @@ ; GFX6-LABEL: v_lshr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1036,31 +1029,29 @@ ; GFX6-LABEL: v_lshr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v9 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 -; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 -; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 -; GFX6-NEXT: v_and_b32_e32 v7, v7, v16 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, v8, v7 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -38,9 +38,8 @@ ; GFX7-LABEL: v_mul_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -169,9 +168,8 @@ ; GFX7-LABEL: v_mul_i16_signext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll @@ -94,9 +94,8 @@ ; GFX8-LABEL: v_mul_v2i16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_mul_lo_u16_e32 v2, v0, v1 ; GFX8-NEXT: v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -559,12 +559,11 @@ ; GFX6-LABEL: v_orn2_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -816,18 +815,17 @@ ; GFX6-LABEL: v_orn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v8 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v8 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_and_b32_e32 v4, v6, v8 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -608,19 +608,17 @@ ; GFX6-LABEL: v_roundeven_v2f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_and_b32_e32 v5, s6, v1 -; GFX6-NEXT: s_mov_b32 s7, 0x43300000 +; GFX6-NEXT: v_and_b32_e32 v5, 0x80000000, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_or_b32_e32 v5, s7, v5 +; GFX6-NEXT: v_or_b32_e32 v5, 0x43300000, v5 ; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, 0x432fffff ; GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5] ; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GFX6-NEXT: v_and_b32_e32 v5, s6, v3 -; GFX6-NEXT: v_or_b32_e32 v5, s7, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX6-NEXT: v_or_b32_e32 v5, 0x43300000, v5 ; GFX6-NEXT: v_add_f64 v[7:8], v[2:3], v[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GFX6-NEXT: v_add_f64 v[4:5], v[7:8], -v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -259,10 +259,9 @@ ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -273,21 +272,19 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v5, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v4, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5 -; GFX8-NEXT: v_sub_u16_e32 v4, s4, v4 +; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_max_i16_e32 v1, v5, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 -; GFX8-NEXT: v_sub_u16_e32 v1, s4, v1 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1 ; GFX8-NEXT: v_max_i16_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 @@ -512,22 +509,21 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -540,33 +536,30 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v10, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v9, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v8, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v10, s5, v10 -; GFX8-NEXT: v_sub_u16_e32 v8, s4, v8 -; GFX8-NEXT: v_max_i16_e32 v1, v10, v1 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_max_i16_e32 v1, v9, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 ; GFX8-NEXT: v_min_i16_e32 v8, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 -; GFX8-NEXT: v_sub_u16_e32 v1, s4, v1 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8 +; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1 ; GFX8-NEXT: v_max_i16_e32 v2, v8, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 ; GFX8-NEXT: v_min_i16_e32 v6, 0, v2 -; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v6, s5, v6 -; GFX8-NEXT: v_sub_u16_e32 v4, v9, v4 +; GFX8-NEXT: v_sub_u16_e32 v6, 0x8000, v6 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_max_i16_e32 v3, v6, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v3 @@ -575,7 +568,7 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX8-NEXT: v_max_i16_e32 v5, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v6, 0x8000, v6 -; GFX8-NEXT: v_sub_u16_e32 v5, v9, v5 +; GFX8-NEXT: v_sub_u16_e32 v5, 0x7fff, v5 ; GFX8-NEXT: v_max_i16_e32 v4, v6, v4 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v5 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v4 @@ -619,7 +612,7 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -853,7 +846,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -2699,19 +2692,17 @@ ; GFX8-LABEL: v_saddsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v3, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, s4, v3 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3 ; GFX8-NEXT: v_max_i16_e32 v4, v4, v1 ; GFX8-NEXT: v_min_i16_e32 v5, 0, v2 ; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5 -; GFX8-NEXT: v_sub_u16_e32 v4, s4, v4 +; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_max_i16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v3 @@ -2838,10 +2829,9 @@ ; GFX6-NEXT: v_min_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -2909,30 +2899,27 @@ ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: saddsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s3, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: s_movk_i32 s2, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v3, s3, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, s2, v2 +; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 ; GFX8-NEXT: v_max_i16_e32 v3, s0, v3 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v1 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 ; GFX8-NEXT: v_max_i16_e32 v3, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v4, s3, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, s2, v3 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3 ; GFX8-NEXT: v_max_i16_e32 v4, s1, v4 ; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 @@ -3009,17 +2996,16 @@ ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3027,32 +3013,30 @@ ; GFX8-LABEL: v_saddsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v6, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v7, s5, v7 -; GFX8-NEXT: v_sub_u16_e32 v6, s4, v6 +; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v2 ; GFX8-NEXT: v_min_i16_e32 v8, 0, v4 ; GFX8-NEXT: v_min_i16_e32 v6, v7, v6 ; GFX8-NEXT: v_max_i16_e32 v7, 0, v4 -; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 -; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8 +; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7 ; GFX8-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v8, 0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 ; GFX8-NEXT: v_max_i16_e32 v7, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 -; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8 +; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7 ; GFX8-NEXT: v_max_i16_e32 v8, v8, v3 ; GFX8-NEXT: v_min_i16_e32 v9, 0, v5 ; GFX8-NEXT: v_min_i16_e32 v7, v8, v7 ; GFX8-NEXT: v_max_i16_e32 v8, 0, v5 -; GFX8-NEXT: v_sub_u16_e32 v9, s5, v9 -; GFX8-NEXT: v_sub_u16_e32 v8, s4, v8 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 ; GFX8-NEXT: v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v8 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v6 @@ -3294,24 +3278,23 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3319,56 +3302,52 @@ ; GFX8-LABEL: v_saddsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v11, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v10, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v9, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v11, s5, v11 -; GFX8-NEXT: v_sub_u16_e32 v9, s4, v9 -; GFX8-NEXT: v_max_i16_e32 v11, v11, v3 -; GFX8-NEXT: v_min_i16_e32 v13, 0, v6 -; GFX8-NEXT: v_min_i16_e32 v9, v11, v9 -; GFX8-NEXT: v_max_i16_e32 v11, 0, v6 -; GFX8-NEXT: v_sub_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 -; GFX8-NEXT: v_max_i16_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v13, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v10, 0x8000, v10 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x7fff, v9 +; GFX8-NEXT: v_max_i16_e32 v10, v10, v3 +; GFX8-NEXT: v_min_i16_e32 v11, 0, v6 +; GFX8-NEXT: v_min_i16_e32 v9, v10, v9 +; GFX8-NEXT: v_max_i16_e32 v10, 0, v6 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v11, 0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v11 -; GFX8-NEXT: v_max_i16_e32 v11, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v7 -; GFX8-NEXT: v_min_i16_e32 v11, v13, v11 -; GFX8-NEXT: v_max_i16_e32 v13, 0, v7 -; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 -; GFX8-NEXT: v_sub_u16_e32 v13, s4, v13 -; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff -; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 -; GFX8-NEXT: v_max_i16_e32 v13, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v14, v12, v14 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v10 +; GFX8-NEXT: v_max_i16_e32 v10, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v4 +; GFX8-NEXT: v_min_i16_e32 v12, 0, v7 +; GFX8-NEXT: v_min_i16_e32 v10, v11, v10 +; GFX8-NEXT: v_max_i16_e32 v11, 0, v7 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_max_i16_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v12, 0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v13, v10, v13 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 -; GFX8-NEXT: v_min_i16_e32 v13, v14, v13 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v10, v10, v14 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v12, v12, v14 -; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v11 +; GFX8-NEXT: v_max_i16_e32 v11, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_max_i16_e32 v12, v12, v5 +; GFX8-NEXT: v_min_i16_e32 v13, 0, v8 +; GFX8-NEXT: v_min_i16_e32 v11, v12, v11 +; GFX8-NEXT: v_max_i16_e32 v12, 0, v8 +; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_max_i16_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v9 ; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v5, v5, v10 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v11 +; GFX8-NEXT: v_add_u16_e32 v1, v1, v10 ; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v13 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v11 ; GFX8-NEXT: v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3670,32 +3649,31 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3703,72 +3681,68 @@ ; GFX8-LABEL: v_saddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v13, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v12, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_sub_u16_e32 v12, s4, v12 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v4 -; GFX8-NEXT: v_min_i16_e32 v16, 0, v8 -; GFX8-NEXT: v_min_i16_e32 v12, v14, v12 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 -; GFX8-NEXT: v_max_i16_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v16, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 +; GFX8-NEXT: v_min_i16_e32 v14, 0, v8 +; GFX8-NEXT: v_min_i16_e32 v12, v13, v12 +; GFX8-NEXT: v_max_i16_e32 v13, 0, v8 +; GFX8-NEXT: v_sub_u16_e32 v14, 0x8000, v14 +; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13 +; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v14, 0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 -; GFX8-NEXT: v_max_i16_e32 v16, v16, v5 -; GFX8-NEXT: v_min_i16_e32 v17, 0, v9 -; GFX8-NEXT: v_min_i16_e32 v14, v16, v14 -; GFX8-NEXT: v_max_i16_e32 v16, 0, v9 -; GFX8-NEXT: v_sub_u16_e32 v17, s5, v17 -; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 -; GFX8-NEXT: v_sub_u16_e32 v16, s4, v16 -; GFX8-NEXT: v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v17, 0, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 +; GFX8-NEXT: v_max_i16_e32 v13, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v14, 0x8000, v14 +; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13 +; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 +; GFX8-NEXT: v_min_i16_e32 v15, 0, v9 +; GFX8-NEXT: v_min_i16_e32 v13, v14, v13 +; GFX8-NEXT: v_max_i16_e32 v14, 0, v9 +; GFX8-NEXT: v_sub_u16_e32 v15, 0x8000, v15 +; GFX8-NEXT: v_sub_u16_e32 v14, 0x7fff, v14 +; GFX8-NEXT: v_max_i16_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v15, 0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff -; GFX8-NEXT: v_min_i16_e32 v5, v5, v16 -; GFX8-NEXT: v_max_i16_e32 v16, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v17, v15, v17 -; GFX8-NEXT: v_sub_u16_e32 v16, v13, v16 -; GFX8-NEXT: v_max_i16_e32 v17, v17, v6 -; GFX8-NEXT: v_min_i16_e32 v18, 0, v10 -; GFX8-NEXT: v_min_i16_e32 v16, v17, v16 -; GFX8-NEXT: v_max_i16_e32 v17, 0, v10 -; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 -; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 -; GFX8-NEXT: v_max_i16_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v18, 0, v3 -; GFX8-NEXT: v_min_i16_e32 v6, v6, v17 -; GFX8-NEXT: v_max_i16_e32 v17, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v14 +; GFX8-NEXT: v_max_i16_e32 v14, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v15, 0x8000, v15 +; GFX8-NEXT: v_sub_u16_e32 v14, 0x7fff, v14 +; GFX8-NEXT: v_max_i16_e32 v15, v15, v6 +; GFX8-NEXT: v_min_i16_e32 v16, 0, v10 +; GFX8-NEXT: v_min_i16_e32 v14, v15, v14 +; GFX8-NEXT: v_max_i16_e32 v15, 0, v10 +; GFX8-NEXT: v_sub_u16_e32 v16, 0x8000, v16 +; GFX8-NEXT: v_sub_u16_e32 v15, 0x7fff, v15 +; GFX8-NEXT: v_max_i16_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v16, 0, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 -; GFX8-NEXT: v_max_i16_e32 v18, v18, v7 -; GFX8-NEXT: v_min_i16_e32 v17, v18, v17 -; GFX8-NEXT: v_max_i16_e32 v18, 0, v11 -; GFX8-NEXT: v_sub_u16_e32 v13, v13, v18 -; GFX8-NEXT: v_min_i16_e32 v18, 0, v11 -; GFX8-NEXT: v_sub_u16_e32 v15, v15, v18 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v15 +; GFX8-NEXT: v_max_i16_e32 v15, 0, v3 +; GFX8-NEXT: v_sub_u16_e32 v16, 0x8000, v16 +; GFX8-NEXT: v_sub_u16_e32 v15, 0x7fff, v15 +; GFX8-NEXT: v_max_i16_e32 v16, v16, v7 +; GFX8-NEXT: v_min_i16_e32 v17, 0, v11 +; GFX8-NEXT: v_min_i16_e32 v15, v16, v15 +; GFX8-NEXT: v_max_i16_e32 v16, 0, v11 +; GFX8-NEXT: v_sub_u16_e32 v17, 0x8000, v17 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v12 ; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v16, 0x7fff, v16 +; GFX8-NEXT: v_max_i16_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v14 +; GFX8-NEXT: v_add_u16_e32 v1, v1, v13 ; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v7, v7, v13 +; GFX8-NEXT: v_min_i16_e32 v7, v7, v16 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v16 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v14 ; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v17 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v15 ; GFX8-NEXT: v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -615,20 +615,19 @@ ; GISEL-LABEL: v_sdiv_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 +; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_xor_b32_e32 v4, v4, v6 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 @@ -679,73 +678,72 @@ ; CGP-LABEL: v_sdiv_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 +; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_xor_b32_e32 v4, v4, v6 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v4, v6 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v7 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_xor_b32_e32 v4, v5, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 +; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 ; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; CGP-NEXT: v_rcp_f32_e32 v6, v6 -; CGP-NEXT: v_rcp_f32_e32 v8, v8 -; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v9, v9, v8 -; CGP-NEXT: v_mul_lo_u32 v10, 0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_rcp_f32_e32 v5, v5 +; CGP-NEXT: v_rcp_f32_e32 v7, v7 +; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v9, v7 +; CGP-NEXT: v_mul_lo_u32 v10, 0, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v5, v6 ; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v6, v2 -; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v6 -; CGP-NEXT: v_mul_lo_u32 v10, v7, v3 -; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v0, v5 +; CGP-NEXT: v_mul_lo_u32 v9, 0, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v1, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v5, v2 +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; CGP-NEXT: v_mul_lo_u32 v10, v6, v3 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v6 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5] ; CGP-NEXT: v_sub_i32_e64 v9, s[6:7], v1, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v6 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v8 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i32> , %y %r = sdiv <2 x i32> %x, %shl.y @@ -756,9 +754,8 @@ ; GISEL-LABEL: v_sdiv_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -792,9 +789,8 @@ ; CGP-LABEL: v_sdiv_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 @@ -829,11 +825,10 @@ ; GISEL-LABEL: v_sdiv_v2i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 -; GISEL-NEXT: v_and_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_and_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 @@ -895,11 +890,10 @@ ; CGP-LABEL: v_sdiv_v2i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 -; CGP-NEXT: v_and_b32_e32 v3, s4, v3 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -1042,34 +1042,33 @@ ; CHECK-LABEL: v_sdiv_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: s_movk_i32 s5, 0xf000 -; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; CHECK-NEXT: s_movk_i32 s4, 0xf000 ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, s5, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, s5, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 ; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 @@ -1090,9 +1089,10 @@ ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s5, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s5, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: s_movk_i32 s4, 0x1000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 @@ -1469,34 +1469,34 @@ ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s6, 0x1000 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, s6 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_movk_i32 s7, 0xf000 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x1000 +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; CGP-NEXT: s_movk_i32 s6, 0xf000 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v6 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v5 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 +; CGP-NEXT: v_mul_lo_u32 v8, s6, v6 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v5 +; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v9 ; CGP-NEXT: v_mul_lo_u32 v10, v5, v7 ; CGP-NEXT: v_mul_hi_u32 v11, v5, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1517,9 +1517,9 @@ ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v6 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v5 +; CGP-NEXT: v_mul_lo_u32 v8, s6, v6 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v9 @@ -1568,9 +1568,9 @@ ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_mul_lo_u32 v8, 0, v7 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v6 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v6 +; CGP-NEXT: v_mul_hi_u32 v11, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s7, v7 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 @@ -1596,7 +1596,7 @@ ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; CGP-NEXT: v_cvt_f32_u32_e32 v9, v5 +; CGP-NEXT: v_cvt_f32_u32_e32 v9, 0x1000 ; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc @@ -1614,9 +1614,9 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc ; CGP-NEXT: v_mul_lo_u32 v9, -1, v6 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v6 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v6 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v6 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -1645,9 +1645,9 @@ ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v9, -1, v6 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v6 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v6 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -1699,9 +1699,9 @@ ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_mul_lo_u32 v8, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v6 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v6 +; CGP-NEXT: v_mul_hi_u32 v11, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s7, v4 ; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1745,34 +1745,33 @@ ; CHECK-LABEL: v_sdiv_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: s_mov_b32 s5, 0xffed2705 -; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; CHECK-NEXT: s_mov_b32 s4, 0xffed2705 ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, s5, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, s5, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 ; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 @@ -1793,9 +1792,10 @@ ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s5, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s5, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 @@ -2172,34 +2172,34 @@ ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s6, 0x12d8fb -; CGP-NEXT: v_cvt_f32_u32_e32 v4, s6 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s7, 0xffed2705 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; CGP-NEXT: s_mov_b32 s6, 0xffed2705 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v6 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v5 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 +; CGP-NEXT: v_mul_lo_u32 v8, s6, v6 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v5 +; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v9 ; CGP-NEXT: v_mul_lo_u32 v10, v5, v7 ; CGP-NEXT: v_mul_hi_u32 v11, v5, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -2220,9 +2220,9 @@ ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v6 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v5 +; CGP-NEXT: v_mul_lo_u32 v8, s6, v6 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v9 @@ -2271,9 +2271,9 @@ ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_mul_lo_u32 v8, 0, v7 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v6 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v6 +; CGP-NEXT: v_mul_hi_u32 v11, s7, v7 +; CGP-NEXT: v_mul_lo_u32 v10, s7, v7 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 @@ -2299,7 +2299,7 @@ ; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; CGP-NEXT: v_cvt_f32_u32_e32 v9, v5 +; CGP-NEXT: v_cvt_f32_u32_e32 v9, 0x12d8fb ; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc @@ -2317,9 +2317,9 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc ; CGP-NEXT: v_mul_lo_u32 v9, -1, v6 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v6 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v6 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v6 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -2348,9 +2348,9 @@ ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v9, -1, v6 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v6 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v6 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -2402,9 +2402,9 @@ ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_mul_lo_u32 v8, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v6 -; CGP-NEXT: v_mul_hi_u32 v11, s6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v6 +; CGP-NEXT: v_mul_hi_u32 v11, s7, v4 +; CGP-NEXT: v_mul_lo_u32 v10, s7, v4 ; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -3298,11 +3298,10 @@ ; GISEL-LABEL: v_sdiv_i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v1, s4, v2 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -3326,10 +3325,9 @@ ; CGP-LABEL: v_sdiv_i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v1, s4, v2 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_i32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_cvt_f32_i32_e32 v0, v0 ; CGP-NEXT: v_rcp_f32_e32 v2, v1 ; CGP-NEXT: v_mul_f32_e32 v2, v0, v2 @@ -3352,8 +3350,7 @@ ; GISEL-LABEL: v_sdiv_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s6, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v1, s6, v4 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 @@ -3362,8 +3359,8 @@ ; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_and_b32_e32 v5, s6, v0 -; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 +; GISEL-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v4, v4 @@ -3378,7 +3375,7 @@ ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v12 ; GISEL-NEXT: v_mul_lo_u32 v12, v4, v11 ; GISEL-NEXT: v_mul_lo_u32 v13, v7, v10 -; GISEL-NEXT: v_and_b32_e32 v0, s6, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 ; GISEL-NEXT: v_mul_hi_u32 v2, v7, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 @@ -3624,15 +3621,14 @@ ; CGP-LABEL: v_sdiv_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v1, s4, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; CGP-NEXT: v_cvt_f32_i32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_cvt_f32_i32_e32 v0, v0 -; CGP-NEXT: v_and_b32_e32 v4, s4, v6 +; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 ; CGP-NEXT: v_rcp_f32_e32 v3, v1 ; CGP-NEXT: v_cvt_f32_i32_e32 v4, v4 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_i32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v3, v0, v3 ; CGP-NEXT: v_trunc_f32_e32 v3, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -875,7 +875,6 @@ ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f7ffffe ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s12, 31 @@ -887,113 +886,113 @@ ; GFX8-NEXT: s_add_i32 s0, s13, s16 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_xor_b32 s13, s0, s16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s13 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX8-NEXT: s_ashr_i32 s12, s8, 31 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_add_i32 s0, s8, s12 ; GFX8-NEXT: s_xor_b32 s0, s0, s12 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: s_sub_i32 s8, 0, s13 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, v3, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_lo_u32 v3, v0, s3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s0, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 -; GFX8-NEXT: s_sub_i32 s0, 0, s13 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s0, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: s_xor_b32 s0, s12, s2 ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_add_i32 s1, s9, s2 -; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_xor_b32_e32 v3, s12, v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; GFX8-NEXT: v_xor_b32_e32 v2, s12, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX8-NEXT: s_ashr_i32 s3, s14, 31 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v1, s13 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s12, v3 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s13 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s12, v2 ; GFX8-NEXT: s_add_i32 s0, s14, s3 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v5 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX8-NEXT: s_xor_b32 s8, s0, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s8 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s8 -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s13, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX8-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 -; GFX8-NEXT: v_mul_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s13, v3 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 ; GFX8-NEXT: s_sub_i32 s0, 0, s8 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, s0, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_mul_lo_u32 v5, s0, v3 ; GFX8-NEXT: s_ashr_i32 s9, s10, 31 ; GFX8-NEXT: s_add_i32 s1, s10, s9 ; GFX8-NEXT: s_xor_b32 s1, s1, s9 -; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX8-NEXT: v_mul_hi_u32 v5, v3, v5 ; GFX8-NEXT: s_xor_b32 s0, s2, s16 -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 -; GFX8-NEXT: v_mul_hi_u32 v6, s1, v5 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s2, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s2, v2 ; GFX8-NEXT: s_ashr_i32 s2, s15, 31 -; GFX8-NEXT: v_mul_lo_u32 v7, v6, s8 +; GFX8-NEXT: v_mul_lo_u32 v6, v3, s8 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: s_add_i32 s0, s15, s2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v7 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v6 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 ; GFX8-NEXT: s_xor_b32 s10, s0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v7, s10 -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s8, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX8-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v6 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX8-NEXT: v_mul_f32_e32 v2, v7, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s8, v3 -; GFX8-NEXT: s_sub_i32 s0, 0, s10 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s10 +; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s8, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s0, v2 +; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s8, v2 +; GFX8-NEXT: s_sub_i32 s0, 0, s10 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, s0, v6 ; GFX8-NEXT: s_xor_b32 s0, s9, s3 ; GFX8-NEXT: s_ashr_i32 s3, s11, 31 ; GFX8-NEXT: s_add_i32 s1, s11, s3 -; GFX8-NEXT: v_mul_hi_u32 v7, v2, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v2 ; GFX8-NEXT: s_xor_b32 s1, s1, s3 -; GFX8-NEXT: v_xor_b32_e32 v6, s0, v6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, s1, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s9, v3 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v6 -; GFX8-NEXT: v_mul_lo_u32 v8, v7, s10 +; GFX8-NEXT: v_xor_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; GFX8-NEXT: v_mul_hi_u32 v8, s1, v2 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, s9, v7 +; GFX8-NEXT: v_mul_lo_u32 v7, v8, s10 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s9, v3 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v7 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v8 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 @@ -1017,138 +1016,137 @@ ; GFX9-LABEL: sdivrem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f7ffffe ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s6, s12, 31 ; GFX9-NEXT: s_add_i32 s0, s12, s6 ; GFX9-NEXT: s_xor_b32 s7, s0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_ashr_i32 s5, s13, 31 -; GFX9-NEXT: s_add_i32 s12, s13, s5 +; GFX9-NEXT: s_ashr_i32 s4, s13, 31 +; GFX9-NEXT: s_add_i32 s5, s13, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s12, s12, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GFX9-NEXT: s_sub_i32 s13, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_ashr_i32 s4, s8, 31 -; GFX9-NEXT: s_add_i32 s8, s8, s4 -; GFX9-NEXT: v_mul_lo_u32 v3, s13, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_ashr_i32 s12, s8, 31 +; GFX9-NEXT: s_add_i32 s8, s8, s12 +; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s8, s8, s4 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: s_sub_i32 s16, 0, s12 +; GFX9-NEXT: s_xor_b32 s8, s8, s12 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: s_sub_i32 s13, 0, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, s13, v1 ; GFX9-NEXT: s_ashr_i32 s13, s9, 31 -; GFX9-NEXT: s_add_i32 s9, s9, s13 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 +; GFX9-NEXT: s_add_i32 s9, s9, s13 ; GFX9-NEXT: s_xor_b32 s9, s9, s13 -; GFX9-NEXT: s_xor_b32 s6, s4, s6 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s7 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: s_xor_b32 s5, s13, s5 -; GFX9-NEXT: v_sub_u32_e32 v4, s8, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v4 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s12 -; GFX9-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, s7, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v2 +; GFX9-NEXT: s_xor_b32 s6, s12, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 -; GFX9-NEXT: s_ashr_i32 s4, s14, 31 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: s_add_i32 s6, s14, s4 -; GFX9-NEXT: s_xor_b32 s6, s6, s4 -; GFX9-NEXT: v_sub_u32_e32 v3, s9, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_subrev_u32_e32 v6, s12, v3 +; GFX9-NEXT: s_ashr_i32 s6, s14, 31 +; GFX9-NEXT: s_add_i32 s7, s14, s6 +; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 +; GFX9-NEXT: s_xor_b32 s7, s7, s6 +; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s9, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: s_sub_i32 s8, 0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s8, v3 +; GFX9-NEXT: s_xor_b32 s4, s13, s4 +; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 +; GFX9-NEXT: s_ashr_i32 s4, s15, 31 +; GFX9-NEXT: s_add_i32 s9, s15, s4 +; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX9-NEXT: s_xor_b32 s9, s9, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s9 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: s_ashr_i32 s5, s10, 31 +; GFX9-NEXT: s_add_i32 s8, s10, s5 +; GFX9-NEXT: s_xor_b32 s8, s8, s5 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, s8, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s7 +; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s8, v6 +; GFX9-NEXT: s_sub_i32 s8, 0, s9 +; GFX9-NEXT: v_mul_lo_u32 v8, s8, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 -; GFX9-NEXT: s_sub_i32 s7, 0, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, s7, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, s5, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s5, v1 -; GFX9-NEXT: s_ashr_i32 s5, s15, 31 -; GFX9-NEXT: s_add_i32 s9, s15, s5 -; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX9-NEXT: s_xor_b32 s9, s9, s5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, s9 -; GFX9-NEXT: s_ashr_i32 s7, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: s_ashr_i32 s7, s11, 31 +; GFX9-NEXT: s_add_i32 s8, s11, s7 ; GFX9-NEXT: s_xor_b32 s8, s8, s7 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, s8, v5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GFX9-NEXT: v_subrev_u32_e32 v7, s12, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v6, s6 -; GFX9-NEXT: v_mul_f32_e32 v2, v8, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, s13, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v3 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v7 -; GFX9-NEXT: s_sub_i32 s8, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v8, s8, v2 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v6 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v6 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s6, v3 -; GFX9-NEXT: s_ashr_i32 s6, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s6 -; GFX9-NEXT: s_xor_b32 s8, s8, s6 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v8 -; GFX9-NEXT: v_mul_hi_u32 v8, s8, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX9-NEXT: v_mul_hi_u32 v7, s8, v7 +; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, v7, s9 +; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 ; GFX9-NEXT: s_xor_b32 s4, s7, s4 -; GFX9-NEXT: v_xor_b32_e32 v3, s7, v3 -; GFX9-NEXT: v_mul_lo_u32 v7, v8, s9 -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v6 -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v3 -; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v7 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v8 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 -; GFX9-NEXT: s_xor_b32 s4, s6, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 +; GFX9-NEXT: v_xor_b32_e32 v6, s5, v6 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subrev_u32_e32 v7, s6, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] @@ -2473,8 +2471,7 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2483,7 +2480,7 @@ ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_short v[0:1], v4 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -2558,9 +2555,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -2570,7 +2566,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v3 ; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2884,13 +2880,12 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3225,51 +3220,51 @@ define amdgpu_kernel void @sdivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)* %out1, i27 %x, i27 %y) { ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s9, 0x7ffffff +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s1, s1, 0x1b0000 -; GFX8-NEXT: s_ashr_i32 s2, s1, 31 -; GFX8-NEXT: s_add_i32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s3, s1, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX8-NEXT: s_sub_i32 s1, 0, s3 -; GFX8-NEXT: s_bfe_i32 s0, s0, 0x1b0000 -; GFX8-NEXT: s_ashr_i32 s8, s0, 31 +; GFX8-NEXT: s_bfe_i32 s0, s7, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s7, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_xor_b32 s8, s0, s7 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX8-NEXT: s_sub_i32 s0, 0, s8 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s0, s0, s8 -; GFX8-NEXT: s_xor_b32 s0, s0, s8 -; GFX8-NEXT: s_xor_b32 s2, s8, s2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_bfe_i32 s4, s6, 0x1b0000 +; GFX8-NEXT: s_ashr_i32 s5, s4, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: s_add_i32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s4, s4, s5 +; GFX8-NEXT: s_xor_b32 s6, s5, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e64 v2, s[0:1], s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e64 v2, s[0:1], s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s8, v1 -; GFX8-NEXT: v_and_b32_e32 v3, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_store_dword v[0:1], v3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_and_b32_e32 v2, s9, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -3293,8 +3288,7 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s5, s8, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x7ffffff +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 @@ -3310,14 +3304,14 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s5, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -401,10 +401,9 @@ ; GFX7-LABEL: v_shl_v2i64_zext_v2i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_brev_b32 s4, -4 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0x3fffffff, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], 2 @@ -413,10 +412,9 @@ ; GFX8-LABEL: v_shl_v2i64_zext_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -4 -; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x3fffffff, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] @@ -425,10 +423,9 @@ ; GFX9-LABEL: v_shl_v2i64_zext_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_brev_b32 s4, -4 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x3fffffff, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] @@ -485,9 +482,8 @@ ; GFX7-LABEL: v_shl_v2i64_sext_v2i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_brev_b32 s4, -8 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 @@ -497,9 +493,8 @@ ; GFX8-LABEL: v_shl_v2i64_sext_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -8 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -509,9 +504,8 @@ ; GFX9-LABEL: v_shl_v2i64_sext_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_brev_b32 s4, -8 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -659,13 +653,12 @@ ; GFX7-LABEL: v_shl_v2i32_zext_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -718,10 +718,9 @@ ; GFX6-LABEL: v_shl_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -830,13 +829,12 @@ define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: shl_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -870,10 +868,9 @@ ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -916,21 +913,20 @@ ; GFX6-LABEL: v_shl_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1061,38 +1057,36 @@ ; GFX6-LABEL: v_shl_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v9 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v11 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v15 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, v2, v16 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, v8, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v5, v16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v4, v16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, v7, v16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, v6, v16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -568,19 +568,18 @@ ; GISEL-LABEL: v_srem_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 +; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v2 @@ -626,19 +625,18 @@ ; CGP-LABEL: v_srem_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 +; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 @@ -697,9 +695,8 @@ ; GISEL-LABEL: v_srem_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -730,9 +727,8 @@ ; CGP-LABEL: v_srem_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 @@ -765,11 +761,10 @@ ; GISEL-LABEL: v_srem_v2i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 -; GISEL-NEXT: v_and_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_and_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 @@ -825,11 +820,10 @@ ; CGP-LABEL: v_srem_v2i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 -; CGP-NEXT: v_and_b32_e32 v3, s4, v3 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1022,34 +1022,33 @@ ; CHECK-LABEL: v_srem_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: s_movk_i32 s5, 0xf000 -; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; CHECK-NEXT: s_movk_i32 s4, 0xf000 ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, s5, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, s5, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 ; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 @@ -1070,9 +1069,10 @@ ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s5, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s5, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: s_movk_i32 s4, 0x1000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 @@ -1445,34 +1445,34 @@ ; CGP-LABEL: v_srem_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s6, 0x1000 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, s6 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_movk_i32 s7, 0xf000 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x1000 +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; CGP-NEXT: s_movk_i32 s6, 0xf000 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v6 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v5 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 +; CGP-NEXT: v_mul_lo_u32 v8, s6, v6 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v5 +; CGP-NEXT: s_movk_i32 s7, 0x1000 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v9 ; CGP-NEXT: v_mul_lo_u32 v10, v5, v7 ; CGP-NEXT: v_mul_hi_u32 v11, v5, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1493,9 +1493,9 @@ ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v6 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v5 +; CGP-NEXT: v_mul_lo_u32 v8, s6, v6 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v9 @@ -1544,9 +1544,9 @@ ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_mul_lo_u32 v8, 0, v7 -; CGP-NEXT: v_mul_lo_u32 v6, s6, v6 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v7, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v6, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v7 +; CGP-NEXT: v_mul_hi_u32 v7, s7, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 @@ -1569,7 +1569,7 @@ ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cvt_f32_u32_e32 v9, v5 +; CGP-NEXT: v_cvt_f32_u32_e32 v9, 0x1000 ; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 @@ -1587,9 +1587,9 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CGP-NEXT: v_mul_lo_u32 v9, -1, v6 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v6 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v6 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -1618,9 +1618,9 @@ ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v9, -1, v6 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v6 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v6 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v6 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -1673,9 +1673,9 @@ ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_mul_lo_u32 v8, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v6, s6, v6 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v4 -; CGP-NEXT: v_mul_hi_u32 v4, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v6, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v4 +; CGP-NEXT: v_mul_hi_u32 v4, s7, v4 ; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 @@ -1717,34 +1717,33 @@ ; CHECK-LABEL: v_srem_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: s_mov_b32 s5, 0xffed2705 -; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; CHECK-NEXT: s_mov_b32 s4, 0xffed2705 ; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v4 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v4 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, s5, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, s5, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 +; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 ; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 ; CHECK-NEXT: v_mul_hi_u32 v9, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 @@ -1765,9 +1764,10 @@ ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc ; CHECK-NEXT: v_mul_lo_u32 v5, -1, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, s5, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, s5, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, s4, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, s4, v2 +; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v7 @@ -2140,34 +2140,34 @@ ; CGP-LABEL: v_srem_v2i64_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s6, 0x12d8fb -; CGP-NEXT: v_cvt_f32_u32_e32 v4, s6 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: s_mov_b32 s7, 0xffed2705 -; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; CGP-NEXT: s_mov_b32 s6, 0xffed2705 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v6 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v5 ; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 +; CGP-NEXT: v_mul_lo_u32 v8, s6, v6 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v5 +; CGP-NEXT: s_mov_b32 s7, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v9 ; CGP-NEXT: v_mul_lo_u32 v10, v5, v7 ; CGP-NEXT: v_mul_hi_u32 v11, v5, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: s_bfe_i32 s8, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -2188,9 +2188,9 @@ ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; CGP-NEXT: v_mul_lo_u32 v7, -1, v5 -; CGP-NEXT: v_mul_lo_u32 v8, s7, v6 -; CGP-NEXT: v_mul_hi_u32 v10, s7, v5 -; CGP-NEXT: v_mul_lo_u32 v9, s7, v5 +; CGP-NEXT: v_mul_lo_u32 v8, s6, v6 +; CGP-NEXT: v_mul_hi_u32 v10, s6, v5 +; CGP-NEXT: v_mul_lo_u32 v9, s6, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v9 @@ -2239,9 +2239,9 @@ ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_mul_lo_u32 v8, 0, v7 -; CGP-NEXT: v_mul_lo_u32 v6, s6, v6 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v7 -; CGP-NEXT: v_mul_hi_u32 v7, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v6, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v7 +; CGP-NEXT: v_mul_hi_u32 v7, s7, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 @@ -2264,7 +2264,7 @@ ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cvt_f32_u32_e32 v9, v5 +; CGP-NEXT: v_cvt_f32_u32_e32 v9, 0x12d8fb ; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 @@ -2282,9 +2282,9 @@ ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CGP-NEXT: v_mul_lo_u32 v9, -1, v6 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v6 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v6 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v6 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -2313,9 +2313,9 @@ ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc ; CGP-NEXT: v_mul_lo_u32 v9, -1, v6 -; CGP-NEXT: v_mul_lo_u32 v10, s7, v8 -; CGP-NEXT: v_mul_hi_u32 v12, s7, v6 -; CGP-NEXT: v_mul_lo_u32 v11, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v10, s6, v8 +; CGP-NEXT: v_mul_hi_u32 v12, s6, v6 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v6 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 @@ -2368,9 +2368,9 @@ ; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_mul_lo_u32 v8, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v6, s6, v6 -; CGP-NEXT: v_mul_lo_u32 v9, s6, v4 -; CGP-NEXT: v_mul_hi_u32 v4, s6, v4 +; CGP-NEXT: v_mul_lo_u32 v6, s7, v6 +; CGP-NEXT: v_mul_lo_u32 v9, s7, v4 +; CGP-NEXT: v_mul_hi_u32 v4, s7, v4 ; CGP-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 @@ -3246,11 +3246,10 @@ ; GISEL-LABEL: v_srem_i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v1, s4, v2 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -3272,10 +3271,9 @@ ; CGP-LABEL: v_srem_i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v1, s4, v2 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_i32_e32 v2, v1 -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_cvt_f32_i32_e32 v3, v0 ; CGP-NEXT: v_rcp_f32_e32 v4, v2 ; CGP-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -3300,8 +3298,7 @@ ; GISEL-LABEL: v_srem_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s6, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v1, s6, v4 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 @@ -3310,8 +3307,8 @@ ; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_and_b32_e32 v5, s6, v0 -; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 +; GISEL-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v4, v4 @@ -3326,7 +3323,7 @@ ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v12 ; GISEL-NEXT: v_mul_lo_u32 v12, v4, v11 ; GISEL-NEXT: v_mul_lo_u32 v13, v7, v10 -; GISEL-NEXT: v_and_b32_e32 v0, s6, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 ; GISEL-NEXT: v_mul_hi_u32 v2, v7, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 @@ -3570,14 +3567,13 @@ ; CGP-LABEL: v_srem_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v1, s4, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; CGP-NEXT: v_cvt_f32_i32_e32 v3, v1 -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_cvt_f32_i32_e32 v4, v0 -; CGP-NEXT: v_and_b32_e32 v6, s4, v6 +; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; CGP-NEXT: v_rcp_f32_e32 v5, v3 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_mul_f32_e32 v5, v4, v5 ; CGP-NEXT: v_trunc_f32_e32 v5, v5 ; CGP-NEXT: v_mad_f32 v4, -v5, v3, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -259,10 +259,9 @@ ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -273,21 +272,19 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5 +; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_max_i16_e32 v1, v4, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1 +; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 @@ -512,22 +509,21 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -540,40 +536,37 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v8, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 -; GFX8-NEXT: v_min_i16_e32 v10, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v10, s5, v10 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_min_i16_e32 v9, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_e32 v1, v8, v1 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v10 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v9 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1 +; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 -; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff ; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_sub_u16_e32 v4, v4, v9 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v6, s5, v6 +; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_max_i16_e32 v5, -1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX8-NEXT: v_sub_u16_e32 v5, v5, v9 +; GFX8-NEXT: v_subrev_u16_e32 v5, 0x7fff, v5 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v3 ; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v4, v5, v4 @@ -619,7 +612,7 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -853,7 +846,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -2685,19 +2678,17 @@ ; GFX8-LABEL: v_ssubsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v3, -1, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v3, s4, v3 +; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 ; GFX8-NEXT: v_max_i16_e32 v3, v3, v1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5 +; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v3 @@ -2824,10 +2815,9 @@ ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -2895,30 +2885,27 @@ ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ssubsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s2, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: s_movk_i32 s3, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v2, s2, v2 +; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, s3, v3 +; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 ; GFX8-NEXT: v_max_i16_e32 v3, -1, v1 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_subrev_u16_e32 v3, s2, v3 +; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v4, s3, v4 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 ; GFX8-NEXT: v_max_i16_e32 v3, s1, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2 @@ -2995,17 +2982,16 @@ ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3013,32 +2999,30 @@ ; GFX8-LABEL: v_ssubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v6, s4, v6 +; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 ; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v7, s5, v7 +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v2 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 ; GFX8-NEXT: v_max_i16_e32 v7, -1, v4 -; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7 +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v4 -; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 ; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v7, -1, v1 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v8 -; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7 +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 ; GFX8-NEXT: v_min_i16_e32 v7, v7, v8 ; GFX8-NEXT: v_max_i16_e32 v8, -1, v5 -; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v5 -; GFX8-NEXT: v_subrev_u16_e32 v9, s5, v9 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v6 @@ -3280,24 +3264,23 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3305,56 +3288,52 @@ ; GFX8-LABEL: v_ssubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v9, -1, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v9, s4, v9 -; GFX8-NEXT: v_min_i16_e32 v11, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9 +; GFX8-NEXT: v_min_i16_e32 v10, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v11, s5, v11 +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10 ; GFX8-NEXT: v_max_i16_e32 v9, v9, v3 -; GFX8-NEXT: v_min_i16_e32 v9, v9, v11 -; GFX8-NEXT: v_max_i16_e32 v11, -1, v6 -; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 -; GFX8-NEXT: v_min_i16_e32 v13, -1, v6 -; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v11, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v13 -; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 -; GFX8-NEXT: v_min_i16_e32 v13, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v9, v9, v10 +; GFX8-NEXT: v_max_i16_e32 v10, -1, v6 +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_min_i16_e32 v11, -1, v6 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_max_i16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v10, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v11 +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_min_i16_e32 v11, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_max_i16_e32 v11, v11, v4 -; GFX8-NEXT: v_min_i16_e32 v11, v11, v13 -; GFX8-NEXT: v_max_i16_e32 v13, -1, v7 -; GFX8-NEXT: v_subrev_u16_e32 v13, s4, v13 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v7 -; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff -; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v13, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 -; GFX8-NEXT: v_sub_u16_e32 v13, v13, v10 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v2 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_max_i16_e32 v10, v10, v4 +; GFX8-NEXT: v_min_i16_e32 v10, v10, v11 +; GFX8-NEXT: v_max_i16_e32 v11, -1, v7 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_min_i16_e32 v12, -1, v7 +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v11, -1, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v12 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_min_i16_e32 v12, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v14, v14, v12 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v5 -; GFX8-NEXT: v_min_i16_e32 v13, v13, v14 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_sub_u16_e32 v10, v14, v10 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_sub_u16_e32 v12, v14, v12 -; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v5 +; GFX8-NEXT: v_min_i16_e32 v11, v11, v12 +; GFX8-NEXT: v_max_i16_e32 v12, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 ; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v13 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v11 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v10 ; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v13 +; GFX8-NEXT: v_sub_u16_e32 v2, v2, v11 ; GFX8-NEXT: v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3656,32 +3635,31 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3689,72 +3667,68 @@ ; GFX8-LABEL: v_ssubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v12, -1, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 +; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 ; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 -; GFX8-NEXT: v_min_i16_e32 v12, v12, v14 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 -; GFX8-NEXT: v_min_i16_e32 v16, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v16 -; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 -; GFX8-NEXT: v_min_i16_e32 v16, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v12, v12, v13 +; GFX8-NEXT: v_max_i16_e32 v13, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v14, 0x8000, v14 +; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v13, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 +; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 -; GFX8-NEXT: v_min_i16_e32 v14, v14, v16 -; GFX8-NEXT: v_max_i16_e32 v16, -1, v9 -; GFX8-NEXT: v_subrev_u16_e32 v16, s4, v16 -; GFX8-NEXT: v_min_i16_e32 v17, -1, v9 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff -; GFX8-NEXT: v_subrev_u16_e32 v17, s5, v17 -; GFX8-NEXT: v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v16, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v17 -; GFX8-NEXT: v_sub_u16_e32 v16, v16, v13 -; GFX8-NEXT: v_min_i16_e32 v17, -1, v2 +; GFX8-NEXT: v_subrev_u16_e32 v14, 0x8000, v14 +; GFX8-NEXT: v_max_i16_e32 v13, v13, v5 +; GFX8-NEXT: v_min_i16_e32 v13, v13, v14 +; GFX8-NEXT: v_max_i16_e32 v14, -1, v9 +; GFX8-NEXT: v_subrev_u16_e32 v14, 0x7fff, v14 +; GFX8-NEXT: v_min_i16_e32 v15, -1, v9 +; GFX8-NEXT: v_subrev_u16_e32 v15, 0x8000, v15 +; GFX8-NEXT: v_max_i16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v14, -1, v2 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v15 +; GFX8-NEXT: v_subrev_u16_e32 v14, 0x7fff, v14 +; GFX8-NEXT: v_min_i16_e32 v15, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v15 -; GFX8-NEXT: v_max_i16_e32 v16, v16, v6 -; GFX8-NEXT: v_min_i16_e32 v16, v16, v17 -; GFX8-NEXT: v_max_i16_e32 v17, -1, v10 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 -; GFX8-NEXT: v_min_i16_e32 v18, -1, v10 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 -; GFX8-NEXT: v_max_i16_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v17, -1, v3 -; GFX8-NEXT: v_min_i16_e32 v6, v6, v18 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 -; GFX8-NEXT: v_min_i16_e32 v18, -1, v3 +; GFX8-NEXT: v_subrev_u16_e32 v15, 0x8000, v15 +; GFX8-NEXT: v_max_i16_e32 v14, v14, v6 +; GFX8-NEXT: v_min_i16_e32 v14, v14, v15 +; GFX8-NEXT: v_max_i16_e32 v15, -1, v10 +; GFX8-NEXT: v_subrev_u16_e32 v15, 0x7fff, v15 +; GFX8-NEXT: v_min_i16_e32 v16, -1, v10 +; GFX8-NEXT: v_subrev_u16_e32 v16, 0x8000, v16 +; GFX8-NEXT: v_max_i16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v15, -1, v3 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v16 +; GFX8-NEXT: v_subrev_u16_e32 v15, 0x7fff, v15 +; GFX8-NEXT: v_min_i16_e32 v16, -1, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 -; GFX8-NEXT: v_max_i16_e32 v17, v17, v7 -; GFX8-NEXT: v_min_i16_e32 v17, v17, v18 -; GFX8-NEXT: v_max_i16_e32 v18, -1, v11 -; GFX8-NEXT: v_sub_u16_e32 v13, v18, v13 -; GFX8-NEXT: v_min_i16_e32 v18, -1, v11 +; GFX8-NEXT: v_subrev_u16_e32 v16, 0x8000, v16 +; GFX8-NEXT: v_max_i16_e32 v15, v15, v7 +; GFX8-NEXT: v_min_i16_e32 v15, v15, v16 +; GFX8-NEXT: v_max_i16_e32 v16, -1, v11 +; GFX8-NEXT: v_subrev_u16_e32 v16, 0x7fff, v16 +; GFX8-NEXT: v_min_i16_e32 v17, -1, v11 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 ; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_sub_u16_e32 v15, v18, v15 -; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_subrev_u16_e32 v17, 0x8000, v17 +; GFX8-NEXT: v_max_i16_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v14 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v13 ; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v7, v7, v15 +; GFX8-NEXT: v_min_i16_e32 v7, v7, v17 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v16 +; GFX8-NEXT: v_sub_u16_e32 v2, v2, v14 ; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v17 +; GFX8-NEXT: v_sub_u16_e32 v3, v3, v15 ; GFX8-NEXT: v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll @@ -80,11 +80,10 @@ ; GFX7-LABEL: v_usubo_i8: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -93,11 +92,10 @@ ; GFX8-LABEL: v_usubo_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -123,11 +121,10 @@ ; GFX7-LABEL: v_usubo_i7: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0x7f -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -136,11 +133,10 @@ ; GFX8-LABEL: v_usubo_i7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -149,11 +145,10 @@ ; GFX9-LABEL: v_usubo_i7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -576,11 +571,10 @@ ; GFX7-LABEL: s_usubo_i8: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -589,11 +583,10 @@ ; GFX8-LABEL: s_usubo_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -619,11 +612,10 @@ ; GFX7-LABEL: s_usubo_i7: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0x7f -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -632,11 +624,10 @@ ; GFX8-LABEL: s_usubo_i7: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -645,11 +636,10 @@ ; GFX9-LABEL: s_usubo_i7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -966,11 +956,10 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) { ; GFX7-LABEL: usubo_i16_sv: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -979,11 +968,10 @@ ; ; GFX8-LABEL: usubo_i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s1, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll @@ -138,12 +138,11 @@ ; GFX7-LABEL: v_trunc_v4i32_to_v4i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -452,7 +452,7 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -617,7 +617,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll @@ -308,9 +308,8 @@ ; GISEL-LABEL: v_udiv_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -354,9 +353,8 @@ ; CGP-LABEL: v_udiv_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 +; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -413,9 +411,8 @@ ; GISEL-LABEL: v_udiv_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -440,9 +437,8 @@ ; CGP-LABEL: v_udiv_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 @@ -477,11 +473,10 @@ ; GISEL-LABEL: v_udiv_v2i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 -; GISEL-NEXT: v_and_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_and_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -525,11 +520,10 @@ ; CGP-LABEL: v_udiv_v2i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 -; CGP-NEXT: v_and_b32_e32 v3, s4, v3 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -1838,9 +1838,8 @@ ; GISEL-LABEL: v_udiv_i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -1866,9 +1865,8 @@ ; CGP-LABEL: v_udiv_i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v2 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v1 @@ -1892,19 +1890,18 @@ ; GISEL-LABEL: v_udiv_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s6, 0xffffff -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_and_b32_e32 v1, s6, v4 -; GISEL-NEXT: v_and_b32_e32 v3, s6, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v1 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1 ; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 @@ -1916,17 +1913,17 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 ; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 @@ -1936,56 +1933,56 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v18 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s6, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s8, -1, 0x10000 -; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 -; GISEL-NEXT: v_and_b32_e32 v2, s6, v2 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mov_b32_e32 v16, s4 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v18, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_mov_b32_e32 v15, s4 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v20, v18 ; GISEL-NEXT: v_mov_b32_e32 v19, s5 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_mov_b32_e32 v15, s7 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v16 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_mov_b32_e32 v16, s6 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v2 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 @@ -1994,7 +1991,7 @@ ; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 ; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v5, v2, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -2007,12 +2004,12 @@ ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 ; GISEL-NEXT: v_mul_lo_u32 v14, v8, v4 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v9 @@ -2029,7 +2026,7 @@ ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mov_b32_e32 v18, s8 +; GISEL-NEXT: v_mov_b32_e32 v18, s7 ; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 ; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 @@ -2040,122 +2037,121 @@ ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v4 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v17, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v0, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v13, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v14 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v1, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v1, v2 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v13, v1, v2 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v11, v3, v7 ; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v17, v3, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v10 ; GISEL-NEXT: v_mul_lo_u32 v9, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v5 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 ; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], 1, v7 -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v6, s[6:7] -; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v11 +; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v5, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v0, v11 ; GISEL-NEXT: v_subb_u32_e64 v11, s[8:9], 0, v10, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v13, v16, v13, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v13, v15, v13, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v15, v16, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v16, v15, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], 1, v12 ; GISEL-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v14, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v10 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[6:7] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v3 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 1, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 1, v8 ; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], 0, v17, s[6:7] ; GISEL-NEXT: v_sub_i32_e64 v9, s[6:7], 0, v9 ; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v19, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v8, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v8, v0, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v16, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v6, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s6, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v1, s6, v2 -; CGP-NEXT: v_and_b32_e32 v2, s6, v4 -; CGP-NEXT: v_and_b32_e32 v3, s6, v6 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1 @@ -2176,8 +2172,8 @@ ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v2, s6, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v1 ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -715,7 +715,6 @@ ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f7ffffe ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 @@ -726,75 +725,75 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_lo_u32 v3, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX8-NEXT: s_sub_i32 s0, 0, s13 -; GFX8-NEXT: v_mul_lo_u32 v4, s0, v1 -; GFX8-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX8-NEXT: v_mul_lo_u32 v3, v0, s12 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s12 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_mul_lo_u32 v5, v1, s13 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s12, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s12, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v5 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v5, v6 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 -; GFX8-NEXT: v_mul_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s13, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s12, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s12, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v6 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s9, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 ; GFX8-NEXT: s_sub_i32 s0, 0, s14 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s15 -; GFX8-NEXT: v_mul_lo_u32 v7, s0, v5 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX8-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX8-NEXT: v_mul_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 -; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_hi_u32 v7, s10, v5 -; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v3 +; GFX8-NEXT: v_mul_lo_u32 v6, s0, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, v3, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s15 +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v5 +; GFX8-NEXT: v_mul_hi_u32 v3, s10, v3 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 +; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX8-NEXT: s_sub_i32 s0, 0, s15 -; GFX8-NEXT: v_mul_lo_u32 v6, s0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v7, s14 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 -; GFX8-NEXT: v_mul_hi_u32 v6, v2, v6 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s10, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s14, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v2, v5, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, v3, s14 +; GFX8-NEXT: v_mul_lo_u32 v7, s0, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v3 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s10, v2 +; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v7 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, v8, s15 -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s14, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s11, v7 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v8 +; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s14, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v2, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v7 +; GFX8-NEXT: v_mul_hi_u32 v7, s11, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s14, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, s15 +; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s14, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s11, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s15, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s15, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v7 @@ -814,7 +813,6 @@ ; GFX9-LABEL: udivrem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f7ffffe ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -822,85 +820,85 @@ ; GFX9-NEXT: s_sub_i32 s1, 0, s13 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s14 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s14 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_lo_u32 v3, s0, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s1, v1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 ; GFX9-NEXT: s_sub_i32 s4, 0, s14 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s12 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s13 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v4, s8, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s12, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v6, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, s9, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s12, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s15 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s12 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s13 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 +; GFX9-NEXT: v_sub_u32_e32 v7, s9, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, s4, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s15 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GFX9-NEXT: v_subrev_u32_e32 v6, s13, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v5 +; GFX9-NEXT: v_mul_hi_u32 v2, s10, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v8, s13, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, s4, v3 +; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v6 ; GFX9-NEXT: s_sub_i32 s4, 0, s15 -; GFX9-NEXT: v_add_u32_e32 v8, 1, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, v7, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v6 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s4, v2 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v3, s10, v3 -; GFX9-NEXT: v_subrev_u32_e32 v8, s13, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s14 -; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, s11, v2 -; GFX9-NEXT: v_sub_u32_e32 v6, s10, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v2, s14 +; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, s10, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v3, v8 +; GFX9-NEXT: v_add_u32_e32 v8, 1, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s14, v6 -; GFX9-NEXT: v_mul_lo_u32 v8, v7, s15 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc -; GFX9-NEXT: v_add_u32_e32 v2, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 +; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v7, s14, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 +; GFX9-NEXT: v_mul_lo_u32 v8, v3, s15 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s14, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v3, vcc -; GFX9-NEXT: v_sub_u32_e32 v3, s11, v8 -; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GFX9-NEXT: v_subrev_u32_e32 v7, s14, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8 +; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v3, 1, v7 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s15, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -2003,16 +2001,15 @@ ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_short v[0:1], v4 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -2047,7 +2044,6 @@ ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 @@ -2070,7 +2066,7 @@ ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -2078,7 +2074,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v1, v0, s[2:3] @@ -2269,7 +2265,6 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2277,7 +2272,7 @@ ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX8-NEXT: s_sub_i32 s1, 0, s2 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_lshr_b32 s9, s0, 16 +; GFX8-NEXT: s_lshr_b32 s8, s0, 16 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2292,7 +2287,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 @@ -2306,7 +2301,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -2316,11 +2311,11 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 -; GFX8-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, s8, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -2592,7 +2587,6 @@ ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX8-NEXT: s_and_b32 s4, s6, 0x7ffffff -; GFX8-NEXT: s_mov_b32 s5, 0x7ffffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 @@ -2610,11 +2604,11 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_and_b32_e32 v2, s5, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_and_b32_e32 v2, s5, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2633,7 +2627,6 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s4, 0x7ffffff ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 @@ -2649,10 +2642,10 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -117,14 +117,13 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GISEL-NEXT: s_mov_b32 s4, 0x4f7ffffe ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v4, s4, v4 -; GISEL-NEXT: v_mul_f32_e32 v6, s4, v6 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4 @@ -217,9 +216,8 @@ ; CHECK-LABEL: v_urem_v2i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0xfff -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; CHECK-NEXT: v_and_b32_e32 v1, s4, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 0xfff, v0 +; CHECK-NEXT: v_and_b32_e32 v1, 0xfff, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i32> %num, ret <2 x i32> %result @@ -257,14 +255,14 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s4, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v2, 0x12d8fb -; GISEL-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v3, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffed2705 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v3 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v0, v3 ; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3 ; GISEL-NEXT: v_mul_lo_u32 v4, v4, s4 @@ -351,18 +349,16 @@ ; GISEL-LABEL: v_urem_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 -; GISEL-NEXT: s_mov_b32 s5, 0x4f7ffffe -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 -; GISEL-NEXT: v_mul_f32_e32 v6, s5, v6 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4 @@ -394,9 +390,8 @@ ; CGP-LABEL: v_urem_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 +; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -449,9 +444,8 @@ ; GISEL-LABEL: v_urem_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -474,9 +468,8 @@ ; CGP-LABEL: v_urem_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 @@ -509,20 +502,18 @@ ; GISEL-LABEL: v_urem_v2i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: s_mov_b32 s5, 0x4f7ffffe -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 -; GISEL-NEXT: v_and_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_and_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 -; GISEL-NEXT: v_mul_f32_e32 v6, s5, v6 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4 @@ -554,11 +545,10 @@ ; CGP-LABEL: v_urem_v2i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 -; CGP-NEXT: v_and_b32_e32 v3, s4, v3 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -951,9 +951,8 @@ ; CHECK-LABEL: v_urem_v2i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0xfff -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; CHECK-NEXT: v_and_b32_e32 v2, s4, v2 +; CHECK-NEXT: v_and_b32_e32 v0, 0xfff, v0 +; CHECK-NEXT: v_and_b32_e32 v2, 0xfff, v2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -967,34 +966,34 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s4, 0x12d8fb ; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 ; CHECK-NEXT: s_mov_b32 s5, 0xffed2705 ; CHECK-NEXT: s_bfe_i32 s6, -1, 0x10000 ; CHECK-NEXT: s_bfe_i32 s7, -1, 0x10000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v4, s4 -; CHECK-NEXT: v_mov_b32_e32 v5, s6 -; CHECK-NEXT: v_mov_b32_e32 v6, s7 -; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v4 +; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v5, s7 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 +; CHECK-NEXT: v_trunc_f32_e32 v6, v6 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, s5, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, s5, v6 ; CHECK-NEXT: v_mul_lo_u32 v8, s5, v3 ; CHECK-NEXT: v_mul_lo_u32 v9, -1, v3 ; CHECK-NEXT: v_mul_hi_u32 v10, s5, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_mul_lo_u32 v9, v4, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v8 ; CHECK-NEXT: v_mul_hi_u32 v11, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v4, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; CHECK-NEXT: v_mul_lo_u32 v10, v3, v7 -; CHECK-NEXT: v_mul_lo_u32 v12, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v12, v6, v7 ; CHECK-NEXT: v_mul_hi_u32 v13, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v6, v7 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v12, v8 @@ -1010,20 +1009,20 @@ ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc ; CHECK-NEXT: v_mul_lo_u32 v7, s5, v3 ; CHECK-NEXT: v_mul_lo_u32 v8, -1, v3 ; CHECK-NEXT: v_mul_hi_u32 v9, s5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, s5, v4 -; CHECK-NEXT: v_mul_lo_u32 v11, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v10, s5, v6 +; CHECK-NEXT: v_mul_lo_u32 v11, v6, v7 ; CHECK-NEXT: v_mul_hi_u32 v12, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v6, v7 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CHECK-NEXT: v_mul_lo_u32 v9, v3, v8 -; CHECK-NEXT: v_mul_lo_u32 v10, v4, v8 +; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8 ; CHECK-NEXT: v_mul_hi_u32 v13, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v4, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 @@ -1039,14 +1038,14 @@ ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v8, vcc +; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc ; CHECK-NEXT: v_mul_lo_u32 v7, v1, v3 ; CHECK-NEXT: v_mul_hi_u32 v8, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, v0, v4 -; CHECK-NEXT: v_mul_lo_u32 v10, v1, v4 -; CHECK-NEXT: v_mul_hi_u32 v11, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v9, v0, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v0, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 @@ -1063,32 +1062,32 @@ ; CHECK-NEXT: v_mul_lo_u32 v8, s4, v3 ; CHECK-NEXT: v_mul_lo_u32 v9, 0, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, s4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v4, s4, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_lo_u32 v6, s4, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v1, v3, vcc +; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v3, vcc ; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v3 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %num, 1235195 ret i64 %result @@ -1100,62 +1099,62 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 ; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, s8 +; GISEL-NEXT: v_madmk_f32 v6, v5, 0x4f800000, v7 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: v_madmk_f32 v9, v7, 0x4f800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GISEL-NEXT: v_mov_b32_e32 v6, s4 ; GISEL-NEXT: v_mov_b32_e32 v5, s5 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GISEL-NEXT: s_sub_u32 s9, 0, 0x12d8fb -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 ; GISEL-NEXT: s_subb_u32 s10, 0, 0 ; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 ; GISEL-NEXT: v_trunc_f32_e32 v10, v10 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v10 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_lo_u32 v11, s6, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v12, s9, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, s6, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, s6, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, s9, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, s10, v8 -; GISEL-NEXT: v_mul_hi_u32 v18, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, s6, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, s7, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, s6, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, s9, v7 +; GISEL-NEXT: v_mul_lo_u32 v17, s10, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, s9, v7 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_mul_lo_u32 v14, v9, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v8, v13 ; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 ; GISEL-NEXT: v_mul_lo_u32 v17, v10, v16 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v8, v16 +; GISEL-NEXT: v_mul_hi_u32 v15, v7, v16 ; GISEL-NEXT: v_mul_hi_u32 v16, v10, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v12 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v11 +; GISEL-NEXT: v_mul_lo_u32 v15, v8, v11 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v11 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 @@ -1167,7 +1166,7 @@ ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc @@ -1182,38 +1181,38 @@ ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, s7, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, s6, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v16 +; GISEL-NEXT: v_mul_lo_u32 v11, s6, v8 +; GISEL-NEXT: v_mul_lo_u32 v13, s7, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, s6, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, s9, v8 -; GISEL-NEXT: v_mul_lo_u32 v15, s10, v8 -; GISEL-NEXT: v_mul_hi_u32 v16, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v7 +; GISEL-NEXT: v_mul_lo_u32 v15, s10, v7 +; GISEL-NEXT: v_mul_hi_u32 v16, s9, v7 ; GISEL-NEXT: v_mul_lo_u32 v17, s6, v9 ; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v19, v8, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; GISEL-NEXT: v_mul_lo_u32 v17, s9, v10 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 ; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v15 +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v15 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 ; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 ; GISEL-NEXT: v_mul_lo_u32 v19, v10, v15 @@ -1224,7 +1223,7 @@ ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v8, v15 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v15 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 @@ -1242,16 +1241,16 @@ ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v8 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v8 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 ; GISEL-NEXT: v_mul_lo_u32 v15, v0, v9 ; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 ; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 @@ -1266,55 +1265,55 @@ ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; GISEL-NEXT: v_mul_hi_u32 v11, v2, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 -; GISEL-NEXT: v_add_i32_e64 v7, s[6:7], v16, v7 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v16, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v13, v8 +; GISEL-NEXT: v_add_i32_e64 v7, s[6:7], v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_lo_u32 v14, s8, v7 -; GISEL-NEXT: v_mul_lo_u32 v15, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_mul_lo_u32 v12, s8, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v8 +; GISEL-NEXT: v_mul_lo_u32 v14, s8, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, 0, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, s8, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, s8, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9 ; GISEL-NEXT: v_mul_lo_u32 v10, s8, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v12 -; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v3, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v8 +; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v3, v7, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v7 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -1351,62 +1350,62 @@ ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: s_mov_b32 s8, 0x12d8fb ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 ; CGP-NEXT: s_mov_b32 s6, 0xffed2705 ; CGP-NEXT: s_bfe_i32 s4, -1, 0x10000 ; CGP-NEXT: s_bfe_i32 s5, -1, 0x10000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; CGP-NEXT: v_cvt_f32_u32_e32 v7, 0x12d8fb +; CGP-NEXT: v_cvt_f32_ubyte0_e32 v8, 0 ; CGP-NEXT: s_bfe_i32 s7, -1, 0x10000 ; CGP-NEXT: s_bfe_i32 s9, -1, 0x10000 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, s8 -; CGP-NEXT: v_mov_b32_e32 v8, s4 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_mov_b32_e32 v6, s4 ; CGP-NEXT: v_mov_b32_e32 v9, s5 -; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v5 -; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v10 +; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 -; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 +; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v5 +; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v7 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 ; CGP-NEXT: v_trunc_f32_e32 v10, v10 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v10 ; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mul_lo_u32 v11, s6, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_mul_lo_u32 v11, s6, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 ; CGP-NEXT: v_mul_lo_u32 v12, s6, v10 ; CGP-NEXT: v_mul_lo_u32 v13, s6, v5 ; CGP-NEXT: v_mul_lo_u32 v14, -1, v5 ; CGP-NEXT: v_mul_hi_u32 v15, s6, v5 -; CGP-NEXT: v_mul_lo_u32 v16, s6, v6 -; CGP-NEXT: v_mul_lo_u32 v17, -1, v6 -; CGP-NEXT: v_mul_hi_u32 v18, s6, v6 +; CGP-NEXT: v_mul_lo_u32 v16, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v17, -1, v7 +; CGP-NEXT: v_mul_hi_u32 v18, s6, v7 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v8, v13 ; CGP-NEXT: v_mul_hi_u32 v19, v5, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 ; CGP-NEXT: v_mul_lo_u32 v17, v10, v16 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v6, v16 +; CGP-NEXT: v_mul_hi_u32 v15, v7, v16 ; CGP-NEXT: v_mul_hi_u32 v16, v10, v16 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_mul_lo_u32 v18, v6, v12 +; CGP-NEXT: v_mul_lo_u32 v18, v7, v12 ; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; CGP-NEXT: v_mul_lo_u32 v15, v5, v11 -; CGP-NEXT: v_mul_lo_u32 v17, v7, v11 +; CGP-NEXT: v_mul_lo_u32 v17, v8, v11 ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 ; CGP-NEXT: v_mul_hi_u32 v14, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 ; CGP-NEXT: v_mul_lo_u32 v19, v10, v12 @@ -1417,7 +1416,7 @@ ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v18, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v18, v7, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc @@ -1433,33 +1432,33 @@ ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc ; CGP-NEXT: v_mul_lo_u32 v11, s6, v5 ; CGP-NEXT: v_mul_lo_u32 v13, -1, v5 ; CGP-NEXT: v_mul_hi_u32 v14, s6, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v16 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v16 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v12, s6, v6 -; CGP-NEXT: v_mul_lo_u32 v15, -1, v6 -; CGP-NEXT: v_mul_hi_u32 v16, s6, v6 -; CGP-NEXT: v_mul_lo_u32 v17, s6, v7 -; CGP-NEXT: v_mul_lo_u32 v18, v7, v11 +; CGP-NEXT: v_mul_lo_u32 v12, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v15, -1, v7 +; CGP-NEXT: v_mul_hi_u32 v16, s6, v7 +; CGP-NEXT: v_mul_lo_u32 v17, s6, v8 +; CGP-NEXT: v_mul_lo_u32 v18, v8, v11 ; CGP-NEXT: v_mul_hi_u32 v19, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; CGP-NEXT: v_mul_lo_u32 v17, s6, v10 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 ; CGP-NEXT: v_mul_lo_u32 v17, v10, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v10, v12 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; CGP-NEXT: v_mul_lo_u32 v16, v6, v15 +; CGP-NEXT: v_mul_lo_u32 v16, v7, v15 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; CGP-NEXT: v_mul_lo_u32 v14, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v16, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v16, v8, v13 ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 @@ -1474,7 +1473,7 @@ ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v17, v6, v15 +; CGP-NEXT: v_mul_hi_u32 v17, v7, v15 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v19, v12 ; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 @@ -1485,7 +1484,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v18 ; CGP-NEXT: v_mov_b32_e32 v18, s9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc @@ -1493,19 +1492,19 @@ ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v16 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v13, vcc +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc ; CGP-NEXT: v_mul_lo_u32 v11, v1, v5 ; CGP-NEXT: v_mul_hi_u32 v13, v0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, v1, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v14, v2, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 -; CGP-NEXT: v_mul_lo_u32 v15, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v16, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_mul_lo_u32 v15, v0, v8 +; CGP-NEXT: v_mul_lo_u32 v16, v1, v8 +; CGP-NEXT: v_mul_hi_u32 v17, v0, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v1, v8 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 @@ -1518,57 +1517,57 @@ ; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 ; CGP-NEXT: v_add_i32_e64 v5, s[6:7], v16, v5 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; CGP-NEXT: v_add_i32_e64 v6, s[6:7], v13, v6 +; CGP-NEXT: v_add_i32_e64 v7, s[6:7], v13, v7 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_mul_lo_u32 v14, s8, v5 ; CGP-NEXT: v_mul_lo_u32 v15, 0, v5 ; CGP-NEXT: v_mul_hi_u32 v5, s8, v5 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_mul_lo_u32 v12, s8, v6 -; CGP-NEXT: v_mul_lo_u32 v16, 0, v6 -; CGP-NEXT: v_mul_hi_u32 v6, s8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v12, s8, v7 +; CGP-NEXT: v_mul_lo_u32 v16, 0, v7 +; CGP-NEXT: v_mul_hi_u32 v7, s8, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_mul_lo_u32 v7, s8, v7 +; CGP-NEXT: v_mul_lo_u32 v8, s8, v8 ; CGP-NEXT: v_mul_lo_u32 v10, s8, v10 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v16, v10 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v5, vcc +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v5, vcc ; CGP-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v12 -; CGP-NEXT: v_subb_u32_e64 v10, s[6:7], v3, v6, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v6 +; CGP-NEXT: v_subb_u32_e64 v10, s[6:7], v3, v7, s[4:5] +; CGP-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v7 ; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7] -; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; CGP-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[6:7] +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] +; CGP-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8 +; CGP-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[6:7] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v6, v19, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v6, v19, v7, vcc ; CGP-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v4 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; CGP-NEXT: v_sub_i32_e32 v12, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc @@ -1576,23 +1575,23 @@ ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; CGP-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v8, v4 +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v7, v4 ; CGP-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 ; CGP-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 ; CGP-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] ; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[4:5] ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, @@ -2367,9 +2366,8 @@ ; GISEL-LABEL: v_urem_i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -2393,9 +2391,8 @@ ; CGP-LABEL: v_urem_i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v2 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 ; CGP-NEXT: v_rcp_f32_e32 v4, v3 @@ -2421,19 +2418,18 @@ ; GISEL-LABEL: v_urem_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s6, 0xffffff -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_and_b32_e32 v3, s6, v4 -; GISEL-NEXT: v_and_b32_e32 v1, s6, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3 ; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 @@ -2445,17 +2441,17 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 ; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 @@ -2465,56 +2461,56 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v18 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s6, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s8, -1, 0x10000 -; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 -; GISEL-NEXT: v_and_b32_e32 v2, s6, v2 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mov_b32_e32 v16, s4 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v18, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_mov_b32_e32 v15, s4 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v20, v18 ; GISEL-NEXT: v_mov_b32_e32 v19, s5 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_mov_b32_e32 v15, s7 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v16 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_mov_b32_e32 v16, s6 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v2 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 @@ -2523,7 +2519,7 @@ ; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 ; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v5, v2, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -2536,12 +2532,12 @@ ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 ; GISEL-NEXT: v_mul_lo_u32 v14, v8, v4 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v9 @@ -2558,7 +2554,7 @@ ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mov_b32_e32 v18, s8 +; GISEL-NEXT: v_mov_b32_e32 v18, s7 ; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 ; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 @@ -2569,118 +2565,117 @@ ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v4 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v17, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v0, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v13, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v14 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v2 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v10 ; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], 0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11 -; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v5, s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v16, v7, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v15, v9, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v0, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v5, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v8, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v15, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s6, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v1, s6, v2 -; CGP-NEXT: v_and_b32_e32 v2, s6, v4 -; CGP-NEXT: v_and_b32_e32 v3, s6, v6 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1 @@ -2705,8 +2700,8 @@ ; CGP-NEXT: v_mul_lo_u32 v3, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v2, s6, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v1 ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -440,7 +440,7 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -601,7 +601,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -205,8 +205,7 @@ ; GFX9PLUS: global_load_dword [[B:v[0-9]+]] ; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] -; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]] -; GFX10-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] +; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] ; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9PLUS: buffer_store_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -1177,7 +1177,6 @@ ; GFX6-LABEL: udiv_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s15, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, -1 @@ -1189,9 +1188,9 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s11 -; GFX6-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX6-NEXT: s_sub_i32 s2, 0, s9 @@ -1216,7 +1215,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, s3, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 @@ -1232,7 +1231,7 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX6-NEXT: v_mul_f32_e32 v4, s3, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v2, s10 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 @@ -1264,9 +1263,8 @@ ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1275,75 +1273,75 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 -; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: s_sub_i32 s2, 0, s10 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, s12, v5 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v5 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, s9 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, s2, v2 +; GFX9-NEXT: s_sub_i32 s2, 0, s11 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, s9 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX9-NEXT: v_add_u32_e32 v8, 1, v1 +; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 -; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 -; GFX9-NEXT: v_mul_f32_e32 v2, s12, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_sub_i32 s2, 0, s11 -; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v6 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, s2, v6 +; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, v2, s10 +; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 -; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 -; GFX9-NEXT: v_mul_lo_u32 v8, v3, s10 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 +; GFX9-NEXT: v_sub_u32_e32 v5, s6, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v6, s6, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s10, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v5, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 +; GFX9-NEXT: v_subrev_u32_e32 v6, s10, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s11 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; GFX9-NEXT: v_sub_u32_e32 v3, s7, v6 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 +; GFX9-NEXT: v_sub_u32_e32 v5, s7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v3 +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i32> %x, %y @@ -1479,26 +1477,25 @@ ; GFX6-LABEL: urem_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_i32 s2, 0, s8 -; GFX6-NEXT: s_sub_i32 s12, 0, s9 +; GFX6-NEXT: s_sub_i32 s12, 0, s8 +; GFX6-NEXT: s_sub_i32 s13, 0, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 -; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s13, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 @@ -1506,7 +1503,7 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -1526,7 +1523,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: s_sub_i32 s4, 0, s11 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, s13, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 @@ -1558,53 +1555,52 @@ ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_sub_i32 s2, 0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_sub_i32 s3, 0, s9 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX9-NEXT: s_sub_i32 s2, 0, s10 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, s12, v5 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v1 +; GFX9-NEXT: s_sub_i32 s2, 0, s10 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s11 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX9-NEXT: s_sub_i32 s2, 0, s11 -; GFX9-NEXT: v_mul_f32_e32 v3, s12, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 ; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 @@ -1809,7 +1805,6 @@ ; GFX6-LABEL: sdiv_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s16, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s15, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, -1 @@ -1825,13 +1820,13 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: s_sub_i32 s1, 0, s3 ; GFX6-NEXT: s_ashr_i32 s0, s4, 31 -; GFX6-NEXT: v_mul_f32_e32 v0, s16, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: s_xor_b32 s2, s0, s2 ; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 ; GFX6-NEXT: s_add_i32 s1, s4, s0 -; GFX6-NEXT: v_mul_f32_e32 v1, s16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -1865,7 +1860,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, s16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 @@ -1893,7 +1888,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v2, s4 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; GFX6-NEXT: v_mul_f32_e32 v4, s16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 @@ -1932,9 +1927,8 @@ ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s15, 0x4f7ffffe -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s8, 31 ; GFX9-NEXT: s_add_i32 s3, s8, s2 @@ -1947,13 +1941,13 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_sub_i32 s14, 0, s3 ; GFX9-NEXT: s_ashr_i32 s8, s4, 31 -; GFX9-NEXT: v_mul_f32_e32 v0, s15, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_xor_b32 s4, s4, s8 ; GFX9-NEXT: v_mul_lo_u32 v2, s14, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s15, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_sub_i32 s14, 0, s9 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -1988,7 +1982,7 @@ ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc @@ -2008,7 +2002,7 @@ ; GFX9-NEXT: s_xor_b32 s6, s6, s5 ; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_xor_b32 s2, s13, s12 @@ -2215,7 +2209,6 @@ ; GFX6-LABEL: srem_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s14, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2223,29 +2216,30 @@ ; GFX6-NEXT: s_add_i32 s8, s8, s2 ; GFX6-NEXT: s_xor_b32 s8, s8, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_ashr_i32 s12, s9, 31 -; GFX6-NEXT: s_add_i32 s9, s9, s12 -; GFX6-NEXT: s_xor_b32 s9, s9, s12 +; GFX6-NEXT: s_ashr_i32 s13, s9, 31 +; GFX6-NEXT: s_add_i32 s9, s9, s13 +; GFX6-NEXT: s_xor_b32 s9, s9, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s13, 0, s8 +; GFX6-NEXT: s_sub_i32 s14, 0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: s_ashr_i32 s12, s4, 31 -; GFX6-NEXT: v_mul_f32_e32 v0, s14, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: s_add_i32 s4, s4, s12 ; GFX6-NEXT: s_xor_b32 s4, s4, s12 -; GFX6-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s14, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s14, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_sub_i32 s13, 0, s9 +; GFX6-NEXT: s_sub_i32 s14, 0, s9 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s13, v1 ; GFX6-NEXT: s_ashr_i32 s13, s5, 31 ; GFX6-NEXT: s_add_i32 s5, s5, s13 +; GFX6-NEXT: s_xor_b32 s5, s5, s13 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s14, v1 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -2253,26 +2247,25 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 -; GFX6-NEXT: s_xor_b32 s4, s5, s13 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: s_ashr_i32 s5, s10, 31 +; GFX6-NEXT: s_ashr_i32 s4, s10, 31 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX6-NEXT: s_add_i32 s8, s10, s5 -; GFX6-NEXT: s_xor_b32 s5, s8, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 +; GFX6-NEXT: s_add_i32 s8, s10, s4 +; GFX6-NEXT: s_xor_b32 s4, s8, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, s14, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 -; GFX6-NEXT: s_sub_i32 s4, 0, s5 +; GFX6-NEXT: s_sub_i32 s5, 0, s4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 @@ -2280,22 +2273,22 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 ; GFX6-NEXT: s_ashr_i32 s8, s11, 31 ; GFX6-NEXT: s_add_i32 s9, s11, s8 -; GFX6-NEXT: s_ashr_i32 s4, s6, 31 +; GFX6-NEXT: s_ashr_i32 s5, s6, 31 ; GFX6-NEXT: s_xor_b32 s8, s9, s8 -; GFX6-NEXT: s_add_i32 s6, s6, s4 +; GFX6-NEXT: s_add_i32 s6, s6, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX6-NEXT: s_xor_b32 s6, s6, s4 +; GFX6-NEXT: s_xor_b32 s6, s6, s5 ; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, s13, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s13, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s5 -; GFX6-NEXT: v_mul_f32_e32 v3, s14, v3 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v2 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GFX6-NEXT: s_sub_i32 s6, 0, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 @@ -2303,14 +2296,14 @@ ; GFX6-NEXT: s_add_i32 s7, s7, s6 ; GFX6-NEXT: s_xor_b32 s7, s7, s6 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v2 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, s8 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 @@ -2326,37 +2319,37 @@ ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s13, 0x4f7ffffe -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s8, 31 ; GFX9-NEXT: s_add_i32 s8, s8, s2 ; GFX9-NEXT: s_xor_b32 s2, s8, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_ashr_i32 s3, s9, 31 -; GFX9-NEXT: s_sub_i32 s12, 0, s2 ; GFX9-NEXT: s_add_i32 s8, s9, s3 +; GFX9-NEXT: s_sub_i32 s12, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_xor_b32 s3, s8, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_ashr_i32 s8, s4, 31 -; GFX9-NEXT: v_mul_f32_e32 v0, s13, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_xor_b32 s4, s4, s8 ; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s13, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_sub_i32 s12, 0, s3 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_ashr_i32 s9, s5, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_ashr_i32 s12, s10, 31 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: s_add_i32 s5, s5, s9 ; GFX9-NEXT: s_xor_b32 s5, s5, s9 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 @@ -2367,9 +2360,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s10, 31 -; GFX9-NEXT: s_add_i32 s4, s10, s2 -; GFX9-NEXT: s_xor_b32 s2, s4, s2 +; GFX9-NEXT: s_add_i32 s2, s10, s12 +; GFX9-NEXT: s_xor_b32 s2, s2, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 @@ -2379,7 +2371,7 @@ ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, s13, v2 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 @@ -2398,7 +2390,7 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: s_xor_b32 s5, s5, s4 ; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, s13, v5 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 @@ -2529,32 +2521,32 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s8, s6, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_and_b32 s9, s4, 0xffff +; GFX6-NEXT: s_and_b32 s9, s6, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX6-NEXT: s_lshr_b32 s6, s6, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX6-NEXT: s_and_b32 s8, s4, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s4, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: s_and_b32 s4, s7, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -2566,18 +2558,17 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: s_mov_b32 s8, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2633,9 +2624,8 @@ ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] @@ -2748,65 +2738,64 @@ ; GFX6-NEXT: s_and_b32 s8, s6, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: v_alignbit_b32 v4, s7, v4, 16 -; GFX6-NEXT: s_and_b32 s9, s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v5, s8, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_and_b32 s8, s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 -; GFX6-NEXT: v_alignbit_b32 v3, s5, v3, 16 -; GFX6-NEXT: v_and_b32_e32 v6, s8, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, v6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GFX6-NEXT: v_mad_f32 v1, -v1, v5, v6 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: v_mad_f32 v2, -v2, v5, v6 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5 ; GFX6-NEXT: s_and_b32 s6, s7, 0xffff -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 ; GFX6-NEXT: s_and_b32 s6, s5, 0xffff -; GFX6-NEXT: v_mul_lo_u32 v1, v1, v4 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: s_lshr_b32 s4, s7, 16 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v1, v2 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: s_lshr_b32 s6, s5, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX6-NEXT: v_mad_f32 v4, -v1, v2, v4 +; GFX6-NEXT: v_mad_f32 v4, -v1, v3, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v2, -v2, v5, v6 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, s4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -2868,10 +2857,9 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 ; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] @@ -3053,12 +3041,11 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3132,11 +3119,10 @@ ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 -; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v5, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -3338,12 +3324,11 @@ ; GFX6-NEXT: s_lshr_b32 s4, s5, 16 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3425,11 +3410,10 @@ ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s7 ; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s6, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s5, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, v4, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -3996,31 +3980,30 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s6, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX6-NEXT: s_and_b32 s8, s6, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 -; GFX6-NEXT: v_and_b32_e32 v5, s8, v2 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GFX6-NEXT: v_mad_f32 v3, -v4, v0, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v0 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 -; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX6-NEXT: v_alignbit_b32 v0, s5, v0, 16 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 ; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -4041,11 +4024,11 @@ ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -4607,30 +4590,29 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX6-NEXT: s_and_b32 s9, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX6-NEXT: s_and_b32 s3, s2, 0x7fff ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 @@ -4644,9 +4626,9 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 ; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s3, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GFX6-NEXT: v_and_b32_e32 v3, s3, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -4661,18 +4643,17 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff -; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_and_b32 s6, s2, 0x7fff +; GFX9-NEXT: s_and_b32 s3, s0, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf000f @@ -4680,14 +4661,14 @@ ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_and_b32_e32 v3, s6, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 -; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 @@ -4701,9 +4682,9 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 ; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 -; GFX9-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v4, s6, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 @@ -4796,55 +4777,54 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s2, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 -; GFX6-NEXT: s_and_b32 s10, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 +; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 +; GFX6-NEXT: s_and_b32 s9, s0, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_bfe_u32 s9, s2, 0xf000f +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 -; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: s_lshr_b32 s8, s2, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 +; GFX6-NEXT: s_lshr_b32 s3, s2, 15 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v1 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v2, s3, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -4858,33 +4838,32 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s6, s2, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff -; GFX9-NEXT: s_and_b32 s8, s0, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 -; GFX9-NEXT: s_bfe_u32 s3, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0xf000f +; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: s_bfe_u32 s3, s2, 0xf000f ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_and_b32_e32 v3, s6, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 @@ -4907,9 +4886,9 @@ ; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 ; GFX9-NEXT: v_sub_u32_e32 v5, s2, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, s6, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 @@ -5057,11 +5036,10 @@ ; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: s_movk_i32 s0, 0x7fff +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -5128,10 +5106,9 @@ ; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 -; GFX9-NEXT: v_and_b32_e32 v3, s0, v4 -; GFX9-NEXT: v_and_b32_e32 v4, s0, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 @@ -5236,18 +5213,18 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s10, s2, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s10 +; GFX6-NEXT: s_bfe_i32 s9, s2, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s9 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX6-NEXT: s_xor_b32 s1, s10, s1 +; GFX6-NEXT: s_xor_b32 s1, s9, s1 ; GFX6-NEXT: s_ashr_i32 s1, s1, 30 ; GFX6-NEXT: s_or_b32 s1, s1, 1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, s1 -; GFX6-NEXT: s_lshr_b32 s9, s0, 15 +; GFX6-NEXT: s_lshr_b32 s8, s0, 15 ; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v6, v6 @@ -5255,29 +5232,28 @@ ; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 ; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX6-NEXT: v_and_b32_e32 v3, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s2, v4 -; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v7, v7 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: s_xor_b32 s0, s1, s0 -; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 ; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 +; GFX6-NEXT: v_trunc_f32_e32 v7, v7 +; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_or_b32 s0, s0, 1 +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 ; GFX6-NEXT: v_mov_b32_e32 v8, s0 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -5292,15 +5268,15 @@ ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 +; GFX6-NEXT: v_mul_lo_u32 v5, v5, s8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: s_lshr_b32 s8, s2, 15 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v5 +; GFX6-NEXT: s_lshr_b32 s3, s2, 15 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v5 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v3, s3, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -5314,46 +5290,45 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s6, s2, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s6 +; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX9-NEXT: s_xor_b32 s1, s6, s1 -; GFX9-NEXT: s_ashr_i32 s1, s1, 30 -; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 15 -; GFX9-NEXT: s_lshr_b32 s9, s0, 15 -; GFX9-NEXT: s_or_b32 s1, s1, 1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: s_lshr_b32 s8, s2, 15 +; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX9-NEXT: s_cselect_b32 s1, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s1, v6 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s0 +; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 +; GFX9-NEXT: s_lshr_b32 s3, s6, 15 +; GFX9-NEXT: s_or_b32 s7, s0, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: s_cselect_b32 s0, s7, 0 +; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 +; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 ; GFX9-NEXT: s_or_b32 s6, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 @@ -5372,16 +5347,16 @@ ; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v5, s9 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, s3 ; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, s3, v5 +; GFX9-NEXT: v_sub_u32_e32 v4, s8, v5 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 @@ -5671,7 +5646,6 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s0, 0x4f7ffffe ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -5679,12 +5653,12 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: s_sub_i32 s0, 0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 -; GFX6-NEXT: s_sub_i32 s0, 0, s2 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX6-NEXT: s_sub_i32 s0, 0, s3 @@ -5721,20 +5695,19 @@ ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 ; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: s_sub_i32 s3, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -5759,12 +5732,12 @@ ; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -6003,7 +5976,6 @@ ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6011,13 +5983,13 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX6-NEXT: s_sub_i32 s2, 0, s7 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 @@ -6055,16 +6027,15 @@ ; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GFX9-NEXT: s_mov_b32 s6, 0x4f7ffffe +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mul_f32_e32 v0, s6, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s6, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -6471,7 +6442,6 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6481,55 +6451,55 @@ ; GFX6-NEXT: s_xor_b32 s2, s0, s1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s11 -; GFX6-NEXT: s_sub_i32 s11, 0, s2 -; GFX6-NEXT: s_ashr_i32 s10, s0, 31 +; GFX6-NEXT: s_ashr_i32 s3, s0, 31 +; GFX6-NEXT: s_add_i32 s0, s0, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_add_i32 s0, s0, s10 -; GFX6-NEXT: s_ashr_i32 s3, s8, 31 -; GFX6-NEXT: s_add_i32 s8, s8, s3 -; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 +; GFX6-NEXT: s_sub_i32 s11, 0, s2 +; GFX6-NEXT: s_xor_b32 s10, s0, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_xor_b32 s12, s3, s1 -; GFX6-NEXT: v_mul_lo_u32 v1, s11, v0 -; GFX6-NEXT: s_xor_b32 s11, s0, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: s_xor_b32 s0, s8, s3 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_sub_i32 s3, 0, s11 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s13, v2 +; GFX6-NEXT: s_ashr_i32 s0, s8, 31 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX6-NEXT: s_add_i32 s8, s8, s0 +; GFX6-NEXT: v_mul_lo_u32 v2, s11, v0 +; GFX6-NEXT: s_xor_b32 s8, s8, s0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX6-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: s_xor_b32 s11, s0, s1 +; GFX6-NEXT: s_sub_i32 s0, 0, s10 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s2, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: s_add_i32 s1, s9, s0 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: s_xor_b32 s2, s0, s10 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s11 +; GFX6-NEXT: s_xor_b32 s2, s0, s3 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 +; GFX6-NEXT: v_xor_b32_e32 v0, s11, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s11, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s11, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 @@ -6540,7 +6510,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s11, 0x4f7ffffe ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 @@ -6549,47 +6518,44 @@ ; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 -; GFX9-NEXT: s_ashr_i32 s9, s6, 31 -; GFX9-NEXT: s_add_i32 s6, s6, s9 +; GFX9-NEXT: s_ashr_i32 s8, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s8 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s6, s9 +; GFX9-NEXT: s_xor_b32 s6, s6, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_sub_i32 s10, 0, s0 -; GFX9-NEXT: v_mul_f32_e32 v0, s11, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_ashr_i32 s7, s4, 31 ; GFX9-NEXT: s_add_i32 s4, s4, s7 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s11, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s4, s4, s7 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 ; GFX9-NEXT: s_sub_i32 s10, 0, s6 -; GFX9-NEXT: s_ashr_i32 s8, s5, 31 -; GFX9-NEXT: s_add_i32 s5, s5, s8 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 +; GFX9-NEXT: s_xor_b32 s4, s4, s7 +; GFX9-NEXT: v_mul_lo_u32 v4, s10, v1 +; GFX9-NEXT: s_ashr_i32 s9, s5, 31 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: s_xor_b32 s5, s5, s8 -; GFX9-NEXT: s_xor_b32 s1, s7, s1 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 +; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_xor_b32 s5, s5, s9 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s0 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 +; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s0, s8, s9 -; GFX9-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -6597,7 +6563,10 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: s_xor_b32 s1, s7, s1 +; GFX9-NEXT: s_xor_b32 s0, s9, s8 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s1, v0 ; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 @@ -6924,45 +6893,44 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s9, 0x4f7ffffe ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 ; GFX6-NEXT: s_ashr_i32 s3, s2, 31 ; GFX6-NEXT: s_add_i32 s2, s2, s3 ; GFX6-NEXT: s_xor_b32 s6, s2, s3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s7 -; GFX6-NEXT: s_ashr_i32 s7, s2, 31 -; GFX6-NEXT: s_add_i32 s2, s2, s7 +; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 +; GFX6-NEXT: s_ashr_i32 s8, s7, 31 +; GFX6-NEXT: s_add_i32 s7, s7, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_xor_b32 s7, s2, s7 +; GFX6-NEXT: s_xor_b32 s7, s7, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 -; GFX6-NEXT: v_mul_f32_e32 v0, s9, v0 +; GFX6-NEXT: s_sub_i32 s9, 0, s6 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: s_ashr_i32 s8, s4, 31 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX6-NEXT: s_add_i32 s2, s4, s8 -; GFX6-NEXT: v_mul_f32_e32 v1, s9, v1 -; GFX6-NEXT: s_xor_b32 s4, s2, s8 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: s_add_i32 s4, s4, s8 +; GFX6-NEXT: v_mul_lo_u32 v2, s9, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX6-NEXT: s_xor_b32 s4, s4, s8 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_sub_i32 s2, 0, s7 -; GFX6-NEXT: s_ashr_i32 s9, s5, 31 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: s_sub_i32 s9, 0, s7 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1 +; GFX6-NEXT: s_ashr_i32 s9, s5, 31 +; GFX6-NEXT: s_add_i32 s5, s5, s9 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GFX6-NEXT: s_add_i32 s4, s5, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: s_xor_b32 s4, s4, s9 +; GFX6-NEXT: s_xor_b32 s4, s5, s9 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 @@ -6986,8 +6954,6 @@ ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s9, 0x4f7ffffe -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 ; GFX9-NEXT: s_ashr_i32 s6, s3, 31 @@ -7003,10 +6969,10 @@ ; GFX9-NEXT: s_sub_i32 s8, 0, s3 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v0, s9, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s4, s4, s6 -; GFX9-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, s8, v0 ; GFX9-NEXT: s_sub_i32 s8, 0, s2 @@ -7022,6 +6988,7 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 @@ -7042,6 +7009,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -9047,9 +9015,6 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 -; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 -; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc -; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[8:9], s[12:13], s8 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s10 @@ -9060,41 +9025,40 @@ ; GFX6-NEXT: s_xor_b64 s[12:13], s[8:9], s[14:15] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 ; GFX6-NEXT: s_sub_u32 s10, 0, s12 ; GFX6-NEXT: s_subb_u32 s11, 0, s13 -; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_ashr_i32 s16, s5, 31 +; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; GFX6-NEXT: s_add_u32 s0, s4, s16 -; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 +; GFX6-NEXT: s_mov_b32 s17, s16 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 +; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s17, s16 ; GFX6-NEXT: s_addc_u32 s1, s5, s16 +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] ; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s10, v0 -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] +; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 +; GFX6-NEXT: s_xor_b64 s[14:15], s[16:17], s[14:15] ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX6-NEXT: s_xor_b64 s[14:15], s[16:17], s[14:15] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc @@ -9176,16 +9140,16 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 -; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 +; GFX6-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX6-NEXT: v_rcp_f32_e32 v3, v8 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v3, s19, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, s20, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mac_f32_e32 v3, s21, v4 +; GFX6-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: s_sub_u32 s0, 0, s2 @@ -9309,9 +9273,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 -; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc -; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 @@ -9322,17 +9283,16 @@ ; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 ; GFX9-NEXT: s_sub_u32 s2, 0, s8 ; GFX9-NEXT: s_subb_u32 s3, 0, s9 -; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s14, s5, 31 +; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s15, s14 -; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 +; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 @@ -9343,16 +9303,16 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -9437,16 +9397,16 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 -; GFX9-NEXT: v_mac_f32_e32 v9, s16, v10 +; GFX9-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc ; GFX9-NEXT: v_rcp_f32_e32 v4, v9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v4, s17, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, s18, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mac_f32_e32 v4, s19, v5 +; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: s_sub_u32 s0, 0, s10 @@ -10220,10 +10180,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 -; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc -; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s10 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 ; GFX6-NEXT: s_ashr_i32 s8, s3, 31 @@ -10233,42 +10191,40 @@ ; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 -; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 ; GFX6-NEXT: s_sub_u32 s2, 0, s16 ; GFX6-NEXT: s_subb_u32 s3, 0, s17 -; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_ashr_i32 s12, s5, 31 +; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; GFX6-NEXT: s_add_u32 s0, s4, s12 -; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 +; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: s_addc_u32 s1, s5, s12 +; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v0 -; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] +; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_mov_b32 s10, -1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc @@ -10343,7 +10299,7 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 -; GFX6-NEXT: v_mac_f32_e32 v6, s18, v7 +; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 @@ -10353,10 +10309,10 @@ ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v3, s19, v6 -; GFX6-NEXT: v_mul_f32_e32 v4, s20, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v6 +; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mac_f32_e32 v3, s21, v4 +; GFX6-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: s_sub_u32 s0, 0, s4 @@ -10478,9 +10434,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 -; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc -; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 @@ -10491,17 +10444,16 @@ ; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 ; GFX9-NEXT: s_sub_u32 s2, 0, s12 ; GFX9-NEXT: s_subb_u32 s3, 0, s13 -; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s8, s5, 31 +; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 +; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 @@ -10512,16 +10464,16 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -10600,7 +10552,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s15 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v6, v2, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 -; GFX9-NEXT: v_mac_f32_e32 v7, s16, v8 +; GFX9-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v1 ; GFX9-NEXT: v_rcp_f32_e32 v7, v7 @@ -10610,10 +10562,10 @@ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v4, s17, v7 -; GFX9-NEXT: v_mul_f32_e32 v5, s18, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v7 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mac_f32_e32 v4, s19, v5 +; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: s_sub_u32 s0, 0, s10 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll @@ -33,9 +33,8 @@ ; GCN-LABEL: f: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0xffff80 -; GCN-NEXT: v_or_b32_e32 v0, s4, v0 -; GCN-NEXT: v_or_b32_e32 v1, s4, v1 +; GCN-NEXT: v_or_b32_e32 v0, 0xffff80, v0 +; GCN-NEXT: v_or_b32_e32 v1, 0xffff80, v1 ; GCN-NEXT: v_mul_i32_i24_e32 v0, v0, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 14, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -275,12 +275,10 @@ ; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64: ; SI-DAG: buffer_load_dwordx2 v[[[LO0:[0-9]+]]:[[HI0:[0-9]+]]] ; SI-DAG: buffer_load_dwordx2 v[[[LO1:[0-9]+]]:[[HI1:[0-9]+]]] -; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}} -; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}} -; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]] -; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]] -; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]] -; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]] +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, v[[LO0]] +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, v[[HI0]] +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, v[[LO1]] +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, v[[HI1]] ; SI: buffer_store_dwordx2 ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -518,7 +518,6 @@ ; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 ; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: s_mov_b32 s5, 0xffff0000 ; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8 @@ -529,9 +528,9 @@ ; SI-NEXT: v_bfi_b32 v0, s4, v0, v5 ; SI-NEXT: v_bfi_b32 v3, s4, v3, v6 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v7 -; SI-NEXT: v_and_b32_e32 v4, s5, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, s5, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -250,25 +250,23 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s12, 0xff00 -; SI-NEXT: s_movk_i32 s13, 0xff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 -; SI-NEXT: v_and_b32_e32 v2, s12, v0 -; SI-NEXT: v_and_b32_e32 v4, s12, v1 -; SI-NEXT: v_and_b32_e32 v3, s13, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xff00, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v1, s13, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -283,18 +281,15 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s12, 0xff00 -; VI-NEXT: s_movk_i32 s13, 0xff -; VI-NEXT: s_movk_i32 s14, 0x900 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 @@ -303,16 +298,16 @@ ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v4, s12, v1 +; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 -; VI-NEXT: v_and_b32_e32 v1, s13, v1 -; VI-NEXT: v_and_b32_e32 v2, s12, v0 -; VI-NEXT: v_and_b32_e32 v3, s13, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffffff00, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v1, s14, v1 -; VI-NEXT: v_add_u16_e32 v2, s14, v2 +; VI-NEXT: v_add_u16_e32 v1, 0x900, v1 +; VI-NEXT: v_add_u16_e32 v2, 0x900, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -340,8 +335,6 @@ ; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b32 s16, 0xff00 -; SI-NEXT: s_movk_i32 s17, 0xff ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s8, s0 @@ -353,12 +346,12 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 -; SI-NEXT: v_and_b32_e32 v2, s16, v0 -; SI-NEXT: v_and_b32_e32 v4, s16, v1 -; SI-NEXT: v_and_b32_e32 v3, s17, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xff00, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v1, s17, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -374,17 +367,14 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s16, 0xff00 -; VI-NEXT: s_movk_i32 s17, 0xff -; VI-NEXT: s_movk_i32 s18, 0x900 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 @@ -394,16 +384,16 @@ ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v4, s16, v1 +; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 -; VI-NEXT: v_and_b32_e32 v1, s17, v1 -; VI-NEXT: v_and_b32_e32 v2, s16, v0 -; VI-NEXT: v_and_b32_e32 v3, s17, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffffff00, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v1, s18, v1 -; VI-NEXT: v_add_u16_e32 v2, s18, v2 +; VI-NEXT: v_add_u16_e32 v1, 0x900, v1 +; VI-NEXT: v_add_u16_e32 v2, 0x900, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -409,12 +409,11 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, s4, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, s4, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 @@ -433,7 +432,6 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -444,8 +442,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-NEXT: v_and_b32_e32 v1, s4, v1 -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 @@ -535,16 +533,15 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, s4, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v5, s4, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v6, s4, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v7, s4, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 @@ -571,7 +568,6 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -584,10 +580,10 @@ ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s4, v3 -; VI-NEXT: v_and_b32_e32 v2, s4, v2 -; VI-NEXT: v_and_b32_e32 v1, s4, v1 -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0 ; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0 ; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 @@ -718,25 +714,24 @@ ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 ; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, s4, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v9, s4, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v10, s4, v2 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v11, s4, v3 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v12, s4, v4 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v13, s4, v5 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v14, s4, v6 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v15, s4, v7 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 @@ -780,7 +775,6 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 @@ -796,10 +790,10 @@ ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s4, v3 -; VI-NEXT: v_and_b32_e32 v2, s4, v2 -; VI-NEXT: v_and_b32_e32 v1, s4, v1 -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 @@ -809,10 +803,10 @@ ; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0 ; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0 ; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0 -; VI-NEXT: v_and_b32_e32 v7, s4, v7 -; VI-NEXT: v_and_b32_e32 v6, s4, v6 -; VI-NEXT: v_and_b32_e32 v5, s4, v5 -; VI-NEXT: v_and_b32_e32 v4, s4, v4 +; VI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; VI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1172,7 +1172,6 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s8, 0xff ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 @@ -1186,11 +1185,11 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s8, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 ; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_and_b32_e32 v2, s8, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1297,7 +1296,7 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v7 ; GFX9-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u16_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v5, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -116,14 +116,13 @@ ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 +; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, v4, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -261,13 +260,12 @@ ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1] ; GFX9-NEXT: s_movk_i32 s4, 0x8000 ; GFX9-NEXT: v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1] ; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX9-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -136,8 +136,7 @@ } ; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32: -; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000 -; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}} +; GCN: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+$}} ; GCN-NOT: v_mul ; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] @@ -152,7 +151,7 @@ } ; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32: -; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} ; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000 ; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]] ; GCN-NOT: v_mul diff --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll --- a/llvm/test/CodeGen/AMDGPU/fexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fexp.ll @@ -18,9 +18,8 @@ ; GCN-LABEL: v_exp_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x3fb8aa3b -; GCN-NEXT: v_mul_f32_e32 v0, s4, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: v_exp_f32_e32 v1, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -32,10 +31,9 @@ ; GCN-LABEL: v_exp_v3f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x3fb8aa3b -; GCN-NEXT: v_mul_f32_e32 v0, s4, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: v_exp_f32_e32 v1, v1 ; GCN-NEXT: v_exp_f32_e32 v2, v2 @@ -48,11 +46,10 @@ ; GCN-LABEL: v_exp_v4f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x3fb8aa3b -; GCN-NEXT: v_mul_f32_e32 v0, s4, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 -; GCN-NEXT: v_mul_f32_e32 v3, s4, v3 +; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: v_exp_f32_e32 v1, v1 ; GCN-NEXT: v_exp_f32_e32 v2, v2 @@ -95,11 +92,10 @@ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s4, 0x3fb8aa3b ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, s4, v0 -; SI-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; SI-NEXT: v_exp_f32_e32 v0, v0 ; SI-NEXT: v_exp_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -141,15 +137,14 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s4, 0x3fb8aa3b ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, s4, v0 -; SI-NEXT: v_mul_f32_e32 v1, s4, v1 -; SI-NEXT: v_mul_f32_e32 v2, s4, v2 -; SI-NEXT: v_mul_f32_e32 v3, s4, v3 +; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; SI-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; SI-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 ; SI-NEXT: v_exp_f32_e32 v0, v0 ; SI-NEXT: v_exp_f32_e32 v1, v1 ; SI-NEXT: v_exp_f32_e32 v2, v2 @@ -159,11 +154,10 @@ ; VI-LABEL: v_exp_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_movk_i32 s4, 0x3dc5 ; VI-NEXT: v_mov_b32_e32 v3, 0x3dc5 -; VI-NEXT: v_mul_f16_e32 v2, s4, v1 +; VI-NEXT: v_mul_f16_e32 v2, 0x3dc5, v1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mul_f16_e32 v4, s4, v0 +; VI-NEXT: v_mul_f16_e32 v4, 0x3dc5, v0 ; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_exp_f16_e32 v2, v2 ; VI-NEXT: v_exp_f16_e32 v4, v4 @@ -177,9 +171,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_movk_i32 s4, 0x3dc5 -; GFX9-NEXT: v_mul_f16_e32 v2, s4, v1 +; GFX9-NEXT: v_mul_f16_e32 v2, 0x3dc5, v1 ; GFX9-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_mul_f16_e32 v3, s4, v0 +; GFX9-NEXT: v_mul_f16_e32 v3, 0x3dc5, v0 ; GFX9-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_exp_f16_e32 v2, v2 ; GFX9-NEXT: v_exp_f16_e32 v3, v3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -515,13 +515,12 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_add_u32_e32 v1, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -548,16 +547,15 @@ ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_add_u32_e32 v1, 4, v0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -1367,12 +1365,11 @@ ; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1402,18 +1399,17 @@ ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0 +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -2254,12 +2250,11 @@ ; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -2289,18 +2284,17 @@ ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -241,9 +241,8 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -369,11 +368,10 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-SAFE-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX9-SAFE-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-SAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v8, 16, v0 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v10, 16, v1 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v2, v12, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -242,9 +242,8 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -370,11 +369,10 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-SAFE-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX9-SAFE-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-SAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v8, 16, v0 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v10, 16, v1 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v2, v12, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -161,13 +161,12 @@ } ; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32: -; GCN-SAFE-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}} ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[SIGNBIT]], [[A]] +; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[SIGNBIT]], [[ADD]] +; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]] diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir --- a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir @@ -170,9 +170,8 @@ # operands # CHECK-LABEL: name: add_f32_1.0_multi_f16_use -# CHECK: %13:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec -# CHECK: %14:vgpr_32 = V_ADD_F16_e32 killed %11, %13, implicit $mode, implicit $exec -# CHECK: %15:vgpr_32 = V_ADD_F16_e32 killed %12, killed %13, implicit $mode, implicit $exec +# CHECK: %14:vgpr_32 = V_ADD_F16_e32 1065353216, killed %11, implicit $mode, implicit $exec +# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, killed %12, implicit $mode, implicit $exec name: add_f32_1.0_multi_f16_use @@ -306,9 +305,8 @@ # constant, and not folded as a multi-use literal for the f16 cases # CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use -# CHECK: %14:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec -# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %11, %14, implicit $mode, implicit $exec -# CHECK: %16:vgpr_32 = V_ADD_F16_e32 %12, %14, implicit $mode, implicit $exec +# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, %11, implicit $mode, implicit $exec +# CHECK: %16:vgpr_32 = V_ADD_F16_e32 1065353216, %12, implicit $mode, implicit $exec # CHECK: %17:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit $mode, implicit $exec name: add_f32_1.0_one_f32_use_multi_f16_use @@ -511,9 +509,8 @@ # constant, and not folded as a multi-use literal for the f16 cases # CHECK-LABEL: name: add_f16_1.0_multi_f32_use -# CHECK: %13:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec -# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $mode, implicit $exec -# CHECK: %15:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec +# CHECK: %14:vgpr_32 = V_ADD_F32_e32 15360, %11, implicit $mode, implicit $exec +# CHECK: %15:vgpr_32 = V_ADD_F32_e32 15360, %12, implicit $mode, implicit $exec name: add_f16_1.0_multi_f32_use alignment: 1 @@ -575,12 +572,10 @@ --- # The low 16-bits are an inline immediate, but the high bits are junk -# FIXME: Should be able to fold this # CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use -# CHECK: %13:vgpr_32 = V_MOV_B32_e32 80886784, implicit $exec -# CHECK: %14:vgpr_32 = V_ADD_F16_e32 %11, %13, implicit $mode, implicit $exec -# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $mode, implicit $exec +# CHECK: %14:vgpr_32 = V_ADD_F16_e32 80886784, %11, implicit $mode, implicit $exec +# CHECK: %15:vgpr_32 = V_ADD_F16_e32 80886784, %12, implicit $mode, implicit $exec name: add_f16_1.0_other_high_bits_multi_f16_use alignment: 1 @@ -641,13 +636,9 @@ ... --- -# FIXME: Should fold inline immediate into f16 and literal use into -# f32 instruction. - # CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32 -# CHECK: %13:vgpr_32 = V_MOV_B32_e32 305413120, implicit $exec -# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $mode, implicit $exec -# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $mode, implicit $exec +# CHECK: %14:vgpr_32 = V_ADD_F32_e32 305413120, %11, implicit $mode, implicit $exec +# CHECK: %15:vgpr_32 = V_ADD_F16_e32 305413120, %12, implicit $mode, implicit $exec name: add_f16_1.0_other_high_bits_use_f16_f32 alignment: 1 exposesReturnsTwice: false diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -242,19 +242,18 @@ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, 0x2f800000 -; SI-NEXT: s_mov_b32 s3, 0xcf800000 +; SI-NEXT: s_mov_b32 s2, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_trunc_f32_e32 v0, s1 ; SI-NEXT: v_trunc_f32_e32 v2, s0 -; SI-NEXT: v_mul_f32_e32 v1, s2, v0 -; SI-NEXT: v_mul_f32_e32 v3, s2, v2 +; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_floor_f32_e32 v4, v1 ; SI-NEXT: v_floor_f32_e32 v5, v3 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v5 -; SI-NEXT: v_fma_f32 v0, v4, s3, v0 -; SI-NEXT: v_fma_f32 v4, v5, s3, v2 +; SI-NEXT: v_fma_f32 v0, v4, s2, v0 +; SI-NEXT: v_fma_f32 v4, v5, s2, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -264,12 +263,11 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s4, 0x2f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trunc_f32_e32 v0, s3 ; VI-NEXT: v_trunc_f32_e32 v4, s2 -; VI-NEXT: v_mul_f32_e32 v1, s4, v0 -; VI-NEXT: v_mul_f32_e32 v2, s4, v4 +; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4 ; VI-NEXT: v_floor_f32_e32 v5, v1 ; VI-NEXT: s_mov_b32 s2, 0xcf800000 ; VI-NEXT: v_floor_f32_e32 v6, v2 @@ -379,29 +377,28 @@ ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s8, 0x2f800000 -; SI-NEXT: s_mov_b32 s9, 0xcf800000 +; SI-NEXT: s_mov_b32 s8, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_trunc_f32_e32 v0, s1 ; SI-NEXT: v_trunc_f32_e32 v2, s0 ; SI-NEXT: v_trunc_f32_e32 v4, s3 ; SI-NEXT: v_trunc_f32_e32 v6, s2 -; SI-NEXT: v_mul_f32_e32 v1, s8, v0 -; SI-NEXT: v_mul_f32_e32 v3, s8, v2 -; SI-NEXT: v_mul_f32_e32 v5, s8, v4 -; SI-NEXT: v_mul_f32_e32 v7, s8, v6 +; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; SI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; SI-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 ; SI-NEXT: v_floor_f32_e32 v8, v1 ; SI-NEXT: v_floor_f32_e32 v9, v3 ; SI-NEXT: v_floor_f32_e32 v10, v5 ; SI-NEXT: v_floor_f32_e32 v11, v7 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v9 -; SI-NEXT: v_fma_f32 v0, v8, s9, v0 -; SI-NEXT: v_fma_f32 v8, v9, s9, v2 +; SI-NEXT: v_fma_f32 v0, v8, s8, v0 +; SI-NEXT: v_fma_f32 v8, v9, s8, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v7, v10 ; SI-NEXT: v_cvt_u32_f32_e32 v5, v11 -; SI-NEXT: v_fma_f32 v4, v10, s9, v4 -; SI-NEXT: v_fma_f32 v9, v11, s9, v6 +; SI-NEXT: v_fma_f32 v4, v10, s8, v4 +; SI-NEXT: v_fma_f32 v9, v11, s8, v6 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v4 @@ -414,34 +411,33 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s2, 0x2f800000 -; VI-NEXT: s_mov_b32 s3, 0xcf800000 +; VI-NEXT: s_mov_b32 s2, 0xcf800000 +; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trunc_f32_e32 v0, s5 ; VI-NEXT: v_trunc_f32_e32 v4, s4 -; VI-NEXT: v_mul_f32_e32 v1, s2, v0 -; VI-NEXT: v_mul_f32_e32 v2, s2, v4 +; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4 ; VI-NEXT: v_floor_f32_e32 v5, v1 ; VI-NEXT: v_floor_f32_e32 v6, v2 -; VI-NEXT: v_fma_f32 v0, v5, s3, v0 +; VI-NEXT: v_fma_f32 v0, v5, s2, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v0 -; VI-NEXT: v_fma_f32 v0, v6, s3, v4 +; VI-NEXT: v_fma_f32 v0, v6, s2, v4 ; VI-NEXT: v_trunc_f32_e32 v4, s7 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v5 -; VI-NEXT: v_mul_f32_e32 v5, s2, v4 +; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; VI-NEXT: v_trunc_f32_e32 v8, s6 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v6 ; VI-NEXT: v_floor_f32_e32 v6, v5 -; VI-NEXT: v_mul_f32_e32 v5, s2, v8 +; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v8 ; VI-NEXT: v_floor_f32_e32 v9, v5 -; VI-NEXT: v_fma_f32 v4, v6, s3, v4 +; VI-NEXT: v_fma_f32 v4, v6, s2, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v7, v6 ; VI-NEXT: v_cvt_u32_f32_e32 v6, v4 -; VI-NEXT: v_fma_f32 v4, v9, s3, v8 +; VI-NEXT: v_fma_f32 v4, v9, s2, v8 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v9 ; VI-NEXT: v_cvt_u32_f32_e32 v4, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/fpow.ll @@ -479,14 +479,13 @@ ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: s_mov_b32 s4, 0x80008000 -; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -2619,8 +2619,7 @@ ; SI-NEXT: v_and_b32_e32 v10, v8, v10 ; SI-NEXT: v_not_b32_e32 v11, v11 ; SI-NEXT: v_and_b32_e32 v11, v9, v11 -; SI-NEXT: s_brev_b32 s8, 1 -; SI-NEXT: v_and_b32_e32 v13, s8, v9 +; SI-NEXT: v_and_b32_e32 v13, 0x80000000, v9 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12 ; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12 @@ -2650,7 +2649,7 @@ ; SI-NEXT: v_and_b32_e32 v8, v6, v8 ; SI-NEXT: v_not_b32_e32 v9, v9 ; SI-NEXT: v_and_b32_e32 v9, v7, v9 -; SI-NEXT: v_and_b32_e32 v11, s8, v7 +; SI-NEXT: v_and_b32_e32 v11, 0x80000000, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -697,10 +697,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX9-NEXT: s_mov_b32 s4, 0xf000f ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -737,14 +736,13 @@ ; SI-NEXT: v_or_b32_e32 v4, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4 -; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_or_b32_e32 v3, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v2, s4, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -844,12 +842,11 @@ ; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4 ; SI-NEXT: v_or_b32_e32 v4, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 @@ -911,10 +908,9 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1164,16 +1160,15 @@ ; SI-LABEL: v_fshr_v2i24: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, 0xffffff -; SI-NEXT: v_and_b32_e32 v6, s4, v4 -; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab -; SI-NEXT: v_mul_hi_u32 v6, v6, s5 -; SI-NEXT: v_and_b32_e32 v7, s4, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab +; SI-NEXT: v_mul_hi_u32 v6, v6, s4 +; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; SI-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; SI-NEXT: v_mul_hi_u32 v6, v7, s5 +; SI-NEXT: v_mul_hi_u32 v6, v7, s4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 @@ -1187,16 +1182,15 @@ ; VI-LABEL: v_fshr_v2i24: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, 0xffffff -; VI-NEXT: v_and_b32_e32 v6, s4, v4 -; VI-NEXT: s_mov_b32 s5, 0xaaaaaaab -; VI-NEXT: v_mul_hi_u32 v6, v6, s5 -; VI-NEXT: v_and_b32_e32 v7, s4, v5 +; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab +; VI-NEXT: v_mul_hi_u32 v6, v6, s4 +; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_mul_hi_u32 v6, v7, s5 +; VI-NEXT: v_mul_hi_u32 v6, v7, s4 ; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 @@ -1210,16 +1204,15 @@ ; GFX9-LABEL: v_fshr_v2i24: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xffffff -; GFX9-NEXT: v_and_b32_e32 v6, s4, v4 -; GFX9-NEXT: s_mov_b32 s5, 0xaaaaaaab -; GFX9-NEXT: v_mul_hi_u32 v6, v6, s5 -; GFX9-NEXT: v_and_b32_e32 v7, s4, v5 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v7, s5 +; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4 ; GFX9-NEXT: v_add_u32_e32 v4, 8, v4 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -24,17 +24,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s5 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -44,7 +43,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -54,15 +52,15 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -163,19 +161,18 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s5, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -577,17 +574,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s5 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -597,7 +593,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -607,15 +602,15 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -627,20 +622,19 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v2 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm @@ -998,17 +992,16 @@ ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s5 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1018,7 +1011,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1028,15 +1020,15 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s2, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1134,17 +1126,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s5 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1154,7 +1145,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 @@ -1168,15 +1158,15 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1275,17 +1265,16 @@ ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s5 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1295,7 +1284,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1305,15 +1293,15 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s3 +; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1418,17 +1406,16 @@ ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s5 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1438,7 +1425,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1448,15 +1434,15 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s3 +; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1560,17 +1546,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s5 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v2, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1580,7 +1565,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1590,15 +1574,15 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1702,17 +1686,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s5 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1723,7 +1706,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1733,15 +1715,15 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2006,17 +1988,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s5 +; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s4 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2027,7 +2008,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2037,15 +2017,15 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s3 +; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s2 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2058,20 +2038,19 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v2 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm @@ -2081,20 +2060,19 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm @@ -2324,17 +2302,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s5 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s4 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2345,7 +2322,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2355,15 +2331,15 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2638,13 +2614,12 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -188,28 +188,27 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -418,12 +417,11 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 @@ -964,34 +962,33 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 ; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v7 ; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v8, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -21,17 +21,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s5 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s4 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 @@ -47,7 +46,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -57,17 +55,17 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s3 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 @@ -189,12 +187,11 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 @@ -226,17 +223,16 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v6, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX8-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v7, s0, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX8-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v4, v6, v7, v4 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -259,13 +255,13 @@ ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v1 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v5, s0, v2 +; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX9-NODL-NEXT: v_and_b32_e32 v6, s0, v6 -; GFX9-NODL-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 ; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -290,13 +286,13 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xff, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v6, s0, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 ; GFX9-DL-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -401,12 +397,11 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 @@ -585,12 +580,11 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v4, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 @@ -727,12 +721,11 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 @@ -911,13 +904,12 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 @@ -1092,17 +1084,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s5 +; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s4 ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 @@ -1119,7 +1110,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1129,17 +1119,17 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s3 +; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s2 ; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 @@ -1156,22 +1146,21 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v2 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] @@ -1182,22 +1171,21 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX9-DL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-DL-NEXT: v_add3_u32 v1, v2, v6, v1 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] @@ -1287,24 +1275,23 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s5 +; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s4 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, s5, v3 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, s4, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1315,7 +1302,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1325,22 +1311,22 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s3 +; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s3, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v4 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1484,15 +1470,14 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 @@ -1523,16 +1508,15 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 -; GFX8-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8 ; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX8-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX8-NEXT: v_bfe_i32 v7, v2, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v4, v8, v9, v4 @@ -1559,8 +1543,8 @@ ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX9-NODL-NEXT: v_and_b32_e32 v6, s0, v6 -; GFX9-NODL-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) @@ -1590,8 +1574,8 @@ ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v6, s0, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v5, v2, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -1697,21 +1681,20 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s5 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v5, v0 @@ -1723,7 +1706,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1733,19 +1715,19 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v5, 8, v3 -; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0 @@ -1867,22 +1849,20 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xff00 -; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v7, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v6 ; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s5, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 @@ -1913,17 +1893,16 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v3 ; GFX8-NEXT: v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v9, 8, v2 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 ; GFX8-NEXT: v_mad_u16 v2, v7, v9, v2 @@ -1956,14 +1935,14 @@ ; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v5, 16, v1 -; GFX9-NODL-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX9-NODL-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v8, 16, v10 -; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 +; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v6, 16, v9 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 -; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 +; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v5, v4 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1994,14 +1973,14 @@ ; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v5, 16, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v8, 16, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v9 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v5, v4 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2094,12 +2073,11 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v4, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v7, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -344,49 +344,48 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 ; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 ; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 ; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 ; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v14, s4, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 ; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v15, s4, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v16, s4, v16 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -917,49 +916,48 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 4 ; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v10, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX7-NEXT: v_bfe_i32 v11, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 ; GFX7-NEXT: v_bfe_i32 v6, v2, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX7-NEXT: v_bfe_i32 v13, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 ; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 ; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v14, s4, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 ; GFX7-NEXT: v_bfe_i32 v9, v2, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX7-NEXT: v_bfe_i32 v16, v0, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v15, s4, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v16, s4, v16 +; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -2204,7 +2202,6 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v3, v2, 20, 4 @@ -2212,9 +2209,9 @@ ; GFX7-NEXT: v_bfe_i32 v5, v2, 4, 4 ; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v10, v0, 20, 4 ; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4 @@ -2223,19 +2220,19 @@ ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_or_b32_e32 v4, v6, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; GFX7-NEXT: v_and_b32_e32 v11, s4, v13 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v13 ; GFX7-NEXT: v_bfe_i32 v14, v0, 24, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 ; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 ; GFX7-NEXT: v_or_b32_e32 v6, v11, v10 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v14 -; GFX7-NEXT: v_and_b32_e32 v14, s4, v16 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4 ; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2243,22 +2240,22 @@ ; GFX7-NEXT: v_bfe_i32 v7, v2, 24, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2 ; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v13, s4, v15 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v15 ; GFX7-NEXT: v_mad_u32_u24 v1, v16, v11, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v5, v0 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX7-NEXT: v_mad_u32_u24 v0, v15, v10, v0 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v9, v14, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -2845,8 +2842,6 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff -; GFX7-NEXT: s_mov_b32 s5, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 28, v2 @@ -2858,13 +2853,13 @@ ; GFX7-NEXT: v_bfe_i32 v9, v2, 4, 4 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_ashrrev_i32_e32 v11, 28, v0 ; GFX7-NEXT: v_bfe_i32 v12, v0, 24, 4 @@ -2879,30 +2874,30 @@ ; GFX7-NEXT: v_or_b32_e32 v6, v8, v7 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v11 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v12 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v13 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v14 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v15 -; GFX7-NEXT: v_and_b32_e32 v13, s4, v16 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v16 ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v17 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_or_b32_e32 v7, v8, v7 ; GFX7-NEXT: v_or_b32_e32 v8, v10, v9 ; GFX7-NEXT: v_or_b32_e32 v9, v13, v12 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v14 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v13, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v6, s5, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 8 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2914,8 +2909,8 @@ ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v4 -; GFX7-NEXT: v_and_b32_e32 v15, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v4 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v5 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v12, v0 ; GFX7-NEXT: v_bfe_u32 v10, v4, 8, 8 ; GFX7-NEXT: v_bfe_u32 v16, v5, 8, 8 @@ -2923,8 +2918,8 @@ ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v0, v10, v16, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v11, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2119,7 +2119,6 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xf0000 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_u32 v8, v2, 20, 4 @@ -2130,12 +2129,12 @@ ; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v7, 15, v2 ; GFX7-NEXT: v_alignbit_b32 v2, v8, v2, 16 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v9 +; GFX7-NEXT: v_and_b32_e32 v8, 0xf0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v0 ; GFX7-NEXT: v_and_b32_e32 v14, 15, v0 ; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v9 +; GFX7-NEXT: v_and_b32_e32 v8, 0xf0000, v9 ; GFX7-NEXT: v_or_b32_e32 v8, v14, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 15, v7 @@ -2226,55 +2225,54 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[2:3] -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v6, 15, v1 -; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 16, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1 -; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX9-NEXT: v_bfe_u32 v4, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v11, v2, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 16, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 28, v2 ; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-NEXT: v_and_b32_e32 v17, v4, v17 -; GFX9-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX9-NEXT: v_and_b32_e32 v15, v4, v15 -; GFX9-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX9-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v8, v12, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v10, v14, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2293,55 +2291,54 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[2:3] -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 16, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 16, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 16, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 28, v2 ; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v17 -; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX9-DL-NEXT: v_and_b32_e32 v15, v4, v15 -; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX9-DL-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v12, 16, v13 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v14, 16, v15 +; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v18, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v11, 16, v1 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2479,8 +2476,6 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xf00 -; GFX7-NEXT: s_movk_i32 s5, 0xf0f ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 4, v2 @@ -2492,27 +2487,27 @@ ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 4, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v8, 0xf00, v8 +; GFX7-NEXT: v_and_b32_e32 v4, 0xf00, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 15, v2 ; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v12, 15, v0 ; GFX7-NEXT: v_bfe_u32 v14, v0, 16, 4 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 12, v0 ; GFX7-NEXT: v_alignbit_b32 v2, v6, v2, 24 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v9 +; GFX7-NEXT: v_and_b32_e32 v6, 0xf00, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 4, v0 ; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_alignbit_b32 v0, v13, v0, 24 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v8, 0xf00, v11 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v15 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v9 +; GFX7-NEXT: v_and_b32_e32 v4, 0xf00, v15 +; GFX7-NEXT: v_and_b32_e32 v6, 0xf00, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xf0f, v0 ; GFX7-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xf0f, v2 ; GFX7-NEXT: v_or_b32_e32 v4, v14, v4 ; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 @@ -2980,55 +2975,54 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] -; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v6, 15, v1 -; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 16, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v1 -; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX9-NEXT: v_bfe_u32 v4, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v11, v2, 4, 4 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 +; GFX9-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v7, v11, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v15, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 16, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 28, v2 ; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-NEXT: v_and_b32_e32 v17, v4, v17 -; GFX9-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX9-NEXT: v_and_b32_e32 v15, v4, v15 -; GFX9-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX9-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v8, v12, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-NEXT: v_lshl_or_b32 v9, v9, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v10, v14, 16, v15 +; GFX9-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v9, v13, 16, v14 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v2, v18, 16, v2 -; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; GFX9-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v10, v15, 16, v16 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v9, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -3048,55 +3042,54 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[2:3] -; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 16, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 16, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 28, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v11, v2, 4, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v11, 16, v12 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 16, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 28, v2 ; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v17, v4, v17 -; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX9-DL-NEXT: v_and_b32_e32 v15, v4, v15 -; GFX9-DL-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX9-DL-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v12, 16, v13 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v14, 16, v15 +; GFX9-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; GFX9-DL-NEXT: v_lshl_or_b32 v9, v13, 16, v14 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v18, 16, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v1, v11, 16, v1 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v17, 16, v2 +; GFX9-DL-NEXT: v_lshl_or_b32 v1, v10, 16, v1 +; GFX9-DL-NEXT: v_lshl_or_b32 v10, v15, 16, v16 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v9, v6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1304,13 +1304,12 @@ ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_movk_i32 s4, 0xff -; SI-NEXT: s_lshr_b32 s5, s11, 8 +; SI-NEXT: s_lshr_b32 s4, s11, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, s4, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_cmp_lg_u32 s6, 13 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s6, 12 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc @@ -1318,28 +1317,27 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_mov_b32 s5, 0xffff -; SI-NEXT: s_lshr_b32 s7, s10, 24 +; SI-NEXT: s_lshr_b32 s4, s10, 24 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_cmp_lg_u32 s6, 11 ; SI-NEXT: v_or_b32_e32 v3, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s7, s10, 16 +; SI-NEXT: s_lshr_b32 s4, s10, 16 ; SI-NEXT: s_cmp_lg_u32 s6, 10 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_lshr_b32 s7, s10, 8 +; SI-NEXT: s_lshr_b32 s4, s10, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, s4, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_cmp_lg_u32 s6, 9 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s6, 8 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc @@ -1347,27 +1345,27 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_lshr_b32 s7, s9, 24 +; SI-NEXT: s_lshr_b32 s4, s9, 24 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_cmp_lg_u32 s6, 7 ; SI-NEXT: v_or_b32_e32 v2, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s7, s9, 16 +; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: s_cmp_lg_u32 s6, 6 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_lshr_b32 s7, s9, 8 +; SI-NEXT: s_lshr_b32 s4, s9, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, s4, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_cmp_lg_u32 s6, 5 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s6, 4 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc @@ -1375,27 +1373,27 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v4, s4, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: s_lshr_b32 s7, s8, 24 +; SI-NEXT: s_lshr_b32 s4, s8, 24 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_cmp_lg_u32 s6, 3 ; SI-NEXT: v_or_b32_e32 v1, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s7, s8, 16 +; SI-NEXT: s_lshr_b32 s4, s8, 16 ; SI-NEXT: s_cmp_lg_u32 s6, 2 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; SI-NEXT: s_lshr_b32 s7, s8, 8 +; SI-NEXT: s_lshr_b32 s4, s8, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v4, s4, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: s_cmp_lg_u32 s6, 1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc @@ -1403,10 +1401,10 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_and_b32_e32 v5, s4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, s5, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1881,47 +1881,46 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, v5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2 -; GFX9-NEXT: v_lshl_or_b32 v3, v7, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc +; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s7, 0 -; GFX9-NEXT: v_lshl_or_b32 v2, v8, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v6, vcc +; GFX9-NEXT: v_lshl_or_b32 v2, v7, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v5, vcc ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v8, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -553,10 +553,9 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_d_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-NEXT: v_and_b32_e32 v4, v6, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX9-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -585,13 +584,12 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v11, v7, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-NEXT: v_and_b32_e32 v5, v2, v6 -; GFX9-NEXT: v_and_b32_e32 v3, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_lshl_or_b32 v11, v7, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: image_sample_d v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -636,13 +634,12 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_c_d_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v9, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v1, v9, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 ; GFX9-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 @@ -689,10 +686,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_d_cl_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX9-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX9-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 @@ -742,12 +738,11 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: v_and_b32_e32 v5, v0, v5 -; GFX9-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 ; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -789,10 +784,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_cd_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-NEXT: v_and_b32_e32 v4, v6, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX9-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -836,13 +830,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_c_cd_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v9, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v1, v9, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 ; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 @@ -889,10 +882,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_cd_cl_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX9-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX9-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 @@ -942,12 +934,11 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: v_and_b32_e32 v5, v0, v5 -; GFX9-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 ; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1136,13 +1127,12 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v13, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v6 -; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 ; GFX9-NEXT: image_sample_c_d_o v0, v[8:13], s[0:7], s[8:11] dmask:0x4 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1172,13 +1162,12 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v13, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v6 -; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 ; GFX9-NEXT: image_sample_c_d_o v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -84,16 +84,15 @@ ; GFX6-NEXT: s_mov_b32 s8, s2 ; GFX6-NEXT: s_mov_b32 s9, s3 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s2, 0x3e22f983 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x3e22f983, v1 ; GFX6-NEXT: v_fract_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_cos_f32_e32 v0, v0 ; GFX6-NEXT: v_cos_f32_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll @@ -32,25 +32,22 @@ ; SI: buffer_load_dword v[[A_F16_0:[0-9]+]] ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] -; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218 -; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c ; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x398c ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] ; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]] +; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], 0x3f317218, v[[R_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]] -; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]] +; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], 0x3f317218, v[[R_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]] ; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]] ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] ; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] -; VI: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]] -; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] +; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x398c, v[[R_F16_2]] +; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -32,10 +32,9 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %res = call <2 x float> @llvm.log.v2f32(<2 x float> %in) @@ -67,14 +66,13 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %res = call <4 x float> @llvm.log.v4f32(<4 x float> %in) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll @@ -32,25 +32,23 @@ ; SI: buffer_load_dword v[[A_F16_0:[0-9]+]] ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] -; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a -; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1 ; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x34d1 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] ; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]] +; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], 0x3e9a209a, v[[R_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]] -; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]] +; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], 0x3e9a209a, v[[R_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]] ; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]] ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] ; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_0]] -; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] -; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] +; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_2]] +; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x34d1, v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -32,10 +32,9 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3e9a209a ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %res = call <2 x float> @llvm.log10.v2f32(<2 x float> %in) @@ -67,14 +66,13 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3e9a209a ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %res = call <4 x float> @llvm.log10.v4f32(<4 x float> %in) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -84,16 +84,15 @@ ; GFX6-NEXT: s_mov_b32 s8, s2 ; GFX6-NEXT: s_mov_b32 s9, s3 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s2, 0x3e22f983 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x3e22f983, v1 ; GFX6-NEXT: v_fract_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_sin_f32_e32 v0, v0 ; GFX6-NEXT: v_sin_f32_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1357,13 +1357,12 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s2, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -1375,13 +1374,12 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx2 v[3:4], v[0:1] -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s2, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s2, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GCN-HSA-NEXT: flat_store_dwordx3 v[5:6], v[0:2] ; GCN-HSA-NEXT: s_endpgm ; @@ -1396,13 +1394,12 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s2, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s2, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -1587,14 +1584,13 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s2, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -1605,14 +1601,13 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s2, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s2, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -1627,14 +1622,13 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s2, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s2, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -1838,7 +1832,6 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) @@ -1846,10 +1839,10 @@ ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s2, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s2, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s2, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s2, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -1857,7 +1850,6 @@ ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1871,12 +1863,12 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s4, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s4, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: s_endpgm @@ -1892,18 +1884,17 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s2, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s2, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s2, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s2, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -2142,7 +2133,6 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 @@ -2154,16 +2144,16 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s6, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s6, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s6, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s6, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s6, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s6, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s6, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s6, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -2173,7 +2163,6 @@ ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2198,23 +2187,23 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s4, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v1, s4, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v14, s4, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[1:4] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] @@ -2233,27 +2222,26 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s6, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s6, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s6, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s6, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s6, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s6, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s6, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -2622,7 +2610,6 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2640,34 +2627,34 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, s0, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v0 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, s0, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, s0, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, s0, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v8 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s0, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, s0, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v12 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 @@ -2683,7 +2670,6 @@ ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s14, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2722,8 +2708,8 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] @@ -2731,31 +2717,31 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v8 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v10 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] @@ -2763,16 +2749,16 @@ ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s14, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s14, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s14, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s14, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] @@ -2793,45 +2779,42 @@ ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s0, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v2 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s0, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s0, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s0, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v4 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, s0, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v11 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v10 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s0, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v9 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v8 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s0, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v14 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s0, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v12 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 @@ -3494,7 +3477,6 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v15 @@ -3504,16 +3486,16 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, s0, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xffff, v13 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, s0, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill @@ -3521,57 +3503,57 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v19 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v18 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v18 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s0, v17 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s0, v16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v16 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v26 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v26 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, s0, v24 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v27 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v26 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v31 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, s0, v30 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v30 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, s0, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, s0, v28 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v29 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v28 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v35 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, s0, v38 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, s0, v37 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, s0, v36 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, s0, v35 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v38 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v37 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v36 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v35 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v40 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v39 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, s0, v42 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, s0, v41 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v42 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v41 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, s0, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, s0, v39 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v39 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v58 @@ -3579,19 +3561,19 @@ ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v56 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, s0, v58 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, s0, v57 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s0, v56 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v55 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v58 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v57 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v56 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v41 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v42 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v41 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v39 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v42 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 @@ -3632,7 +3614,6 @@ ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s12, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3676,8 +3657,8 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s12, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s12, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1] @@ -3695,16 +3676,16 @@ ; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s12, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s12, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 @@ -3714,8 +3695,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 @@ -3724,15 +3705,15 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 @@ -3743,30 +3724,30 @@ ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s12, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v15 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s12, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v18 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[1:4] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 @@ -3781,43 +3762,43 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v32 -; GCN-HSA-NEXT: v_and_b32_e32 v14, s12, v33 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s12, v32 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v33 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s12, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s12, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v22 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v35 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v34 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s12, v35 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s12, v34 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v35 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v34 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v29 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s12, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s12, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v28 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -3842,76 +3823,75 @@ ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, 0xffff ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, s4, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, 0xffff, v11 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, s4, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, 0xffff, v10 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s4, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v9 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s4, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s4, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s4, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s4, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s4, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, s4, v27 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v27 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s4, v26 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v26 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, s4, v25 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v25 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s4, v24 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s4, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v31 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, s4, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v30 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s4, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v29 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s4, v28 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v28 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, s4, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, 0xffff, v35 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, s4, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v34 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, s4, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v33 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s4, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v32 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, s4, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v23 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s4, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v22 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s4, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v21 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s4, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v19 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s4, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v19 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, s4, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xffff, v18 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s4, v17 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v17 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s4, v16 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, s4, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, 0xffff, v15 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, s4, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v14 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s4, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v13 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s4, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 @@ -5761,7 +5741,6 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1 @@ -5770,8 +5749,8 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s2, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -5779,7 +5758,6 @@ ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5796,9 +5774,9 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s4, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s4, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: s_endpgm @@ -5814,7 +5792,6 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 @@ -5822,9 +5799,9 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s2, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v9 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s2, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -6065,15 +6042,14 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, 0xffff -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0 @@ -6083,30 +6059,30 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s12, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s12, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s12, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s12, v3 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v12 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6121,24 +6097,23 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s4, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v12 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s4, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s4, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] @@ -6156,7 +6131,6 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 @@ -6168,13 +6142,13 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v17 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s6, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s6, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s6, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 @@ -6524,27 +6498,26 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s0, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, s0, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v6 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, s0, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v20 @@ -6577,10 +6550,10 @@ ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6598,29 +6571,28 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v7 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 @@ -6628,13 +6600,13 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s6, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[7:10] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 @@ -6642,10 +6614,10 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s6, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 @@ -6667,9 +6639,9 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v28 @@ -6681,27 +6653,26 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, s0, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, s0, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v4 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v28 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v28 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96 @@ -7301,7 +7272,6 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 @@ -7316,48 +7286,48 @@ ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, s0, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, s0, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, s0, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, s0, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v9 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, s0, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v10 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v12 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, s0, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v11 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, s0, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v13 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, s0, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v14 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v1 @@ -7439,7 +7409,6 @@ ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s18, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 @@ -7482,53 +7451,53 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s13 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[5:8] ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v18 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[5:8] -; GCN-HSA-NEXT: v_and_b32_e32 v3, s18, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v15 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[5:8] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v13 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 @@ -7539,26 +7508,26 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v19 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v17 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v11 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[5:8] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s18, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 @@ -7566,7 +7535,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_and_b32_e32 v17, s18, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v9 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -7575,7 +7544,7 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s18, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -7603,39 +7572,27 @@ ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, s0, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v33 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s0, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v36 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s0, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v38 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v37 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, s0, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, s0, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, s0, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, s0, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, s0, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s0, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s0, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, s0, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, s0, v35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v37 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v35 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v37 @@ -7647,7 +7604,9 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v31 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v37 @@ -7655,9 +7614,15 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v37 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v34 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v37 @@ -7668,10 +7633,13 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144 @@ -7680,6 +7648,7 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v37 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -485,13 +485,12 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b32 s4, 0xffff ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; CI-NEXT: v_and_b32_e32 v0, s4, v0 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; CI-NEXT: v_and_b32_e32 v1, s4, v1 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 ; CI-NEXT: v_lshrrev_b32_e32 v1, v3, v1 @@ -573,13 +572,12 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b32 s4, 0xff00ff ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CI-NEXT: v_and_b32_e32 v3, s4, v3 -; CI-NEXT: v_and_b32_e32 v2, s4, v2 +; CI-NEXT: v_and_b32_e32 v3, 0xff00ff, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll --- a/llvm/test/CodeGen/AMDGPU/madmk.ll +++ b/llvm/test/CodeGen/AMDGPU/madmk.ll @@ -31,9 +31,8 @@ ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000 -; GCN-DAG: v_mac_f32_e32 [[VB]], [[SK]], [[VA]] -; GCN-DAG: v_mac_f32_e32 [[VC]], [[SK]], [[VA]] +; GCN-DAG: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]] +; GCN-DAG: v_mac_f32_e32 [[VC]], 0x41200000, [[VA]] ; GCN: s_endpgm define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll --- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll @@ -3,9 +3,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; GCN-LABEL: {{^}}v_mul_i16: -; SI: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} -; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]] -; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]] +; SI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} ; SI: v_mul_u32_u24 ; GFX89: v_mul_lo_u16_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -824,13 +824,12 @@ ; GCN-LABEL: test_umul24_anyextend_i23_src0_src1: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x7fffff -; GCN-NEXT: v_and_b32_e32 v0, s4, v0 -; GCN-NEXT: v_and_b32_e32 v1, s4, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 0xea, v0 ; GCN-NEXT: v_mul_u32_u24_e32 v1, 0x39b, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0x7ffffe, v0 -; GCN-NEXT: v_and_b32_e32 v1, s4, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 ; GCN-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0x1fffe, v0 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x63, v0 diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll --- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll @@ -89,11 +89,10 @@ ret void } ; CHECK-LABEL: {{^}}vector_imm: -; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64 -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}} define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -50,8 +50,8 @@ } ; GCN-LABEL: {{^}}fadd_v2_v_imm: -; GCN: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}} +; GFX90A: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 +; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}} ; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fadd_v2_v_imm(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -261,8 +261,8 @@ } ; GCN-LABEL: {{^}}fmul_v2_v_imm: -; GCN: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}} +; GFX90A: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 +; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}} ; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fmul_v2_v_imm(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -552,8 +552,7 @@ } ; GCN-LABEL: {{^}}fneg_v2f32_vec: -; GFX900: s_brev_b32 [[SIGN:s[0-9]+]], 1 -; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, [[SIGN]], v{{[0-9]+}} +; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}} define amdgpu_kernel void @fneg_v2f32_vec(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll --- a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -2,9 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; SI-LABEL: {{^}}s_movk_i32_k0: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -16,9 +15,8 @@ } ; SI-LABEL: {{^}}s_movk_i32_k1: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x7fff, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -32,7 +30,7 @@ ; SI-LABEL: {{^}}s_movk_i32_k2: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x7fff, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -44,9 +42,8 @@ } ; SI-LABEL: {{^}}s_movk_i32_k3: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x8000, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -58,9 +55,8 @@ } ; SI-LABEL: {{^}}s_movk_i32_k4: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x20000, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -72,11 +68,9 @@ } ; SI-LABEL: {{^}}s_movk_i32_k5: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffffef, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xff00ffff, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -87,9 +81,8 @@ } ; SI-LABEL: {{^}}s_movk_i32_k6: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x41, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -101,11 +94,9 @@ } ; SI-LABEL: {{^}}s_movk_i32_k7: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}} -; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x2000, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x4000, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -116,11 +107,9 @@ } ; SI-LABEL: {{^}}s_movk_i32_k8: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8000, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -131,11 +120,9 @@ } ; SI-LABEL: {{^}}s_movk_i32_k9: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8001, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -146,11 +133,9 @@ } ; SI-LABEL: {{^}}s_movk_i32_k10: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8888, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -161,11 +146,9 @@ } ; SI-LABEL: {{^}}s_movk_i32_k11: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8fff, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -176,11 +159,9 @@ } ; SI-LABEL: {{^}}s_movk_i32_k12: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} ; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]], -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff7001, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -134,13 +134,11 @@ ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -156,15 +154,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -198,18 +195,16 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_i32_e32 v3, s5, v2 +; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -224,22 +219,21 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 @@ -273,29 +267,26 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v2, s5, v2 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -308,15 +299,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -327,14 +317,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -368,15 +358,14 @@ ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -388,15 +377,14 @@ ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: s_brev_b32 s6, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -401,7 +401,6 @@ ; GCN-NEXT: s_mov_b32 s8, s2 ; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -421,8 +420,8 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 -; GCN-NEXT: v_mul_f32_e32 v5, s2, v5 -; GCN-NEXT: v_mul_f32_e32 v7, s2, v7 +; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; GCN-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 @@ -475,7 +474,6 @@ ; TONGA-NEXT: s_mov_b32 s8, s2 ; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s2, 0x4f7ffffe ; TONGA-NEXT: s_mov_b32 s4, s0 ; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) @@ -495,8 +493,8 @@ ; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v3 -; TONGA-NEXT: v_mul_f32_e32 v5, s2, v5 -; TONGA-NEXT: v_mul_f32_e32 v7, s2, v7 +; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; TONGA-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 ; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v7 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 @@ -549,7 +547,6 @@ ; GFX9-NEXT: s_mov_b32 s8, s2 ; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -567,8 +564,8 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GFX9-NEXT: v_mul_f32_e32 v6, s2, v6 -; GFX9-NEXT: v_mul_f32_e32 v7, s2, v7 +; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v8 @@ -807,123 +804,122 @@ ; GCN-NEXT: s_mov_b32 s5, s3 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GCN-NEXT: s_mov_b32 s8, s0 ; GCN-NEXT: s_mov_b32 s9, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 -; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 -; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 -; GCN-NEXT: v_cvt_f32_u32_e32 v9, v5 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 +; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 ; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 +; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 ; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GCN-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GCN-NEXT: v_mul_lo_u32 v9, v9, v8 +; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 ; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 -; GCN-NEXT: v_mul_f32_e32 v9, s2, v9 ; GCN-NEXT: v_xor_b32_e32 v16, v10, v11 +; GCN-NEXT: v_mul_hi_u32 v9, v8, v9 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 +; GCN-NEXT: v_cvt_f32_u32_e32 v10, v5 ; GCN-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 ; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 +; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v10 +; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v11 +; GCN-NEXT: v_mul_lo_u32 v11, v8, v4 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 ; GCN-NEXT: v_xor_b32_e32 v17, v12, v13 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 -; GCN-NEXT: v_mul_f32_e32 v8, s2, v8 ; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v8 -; GCN-NEXT: v_mul_hi_u32 v12, v9, v12 -; GCN-NEXT: v_mul_f32_e32 v11, s2, v11 -; GCN-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GCN-NEXT: v_mul_hi_u32 v10, v8, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v6 -; GCN-NEXT: v_mul_lo_u32 v12, v12, v11 -; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 -; GCN-NEXT: v_mul_hi_u32 v12, v11, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 ; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GCN-NEXT: v_xor_b32_e32 v7, v7, v14 -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GCN-NEXT: v_mul_lo_u32 v12, v8, v4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v9 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v11 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] -; GCN-NEXT: v_mul_f32_e32 v10, s2, v10 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_mul_lo_u32 v0, v9, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v10 -; GCN-NEXT: v_mul_lo_u32 v10, v11, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v9 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 +; GCN-NEXT: v_mul_lo_u32 v0, v0, v10 +; GCN-NEXT: v_xor_b32_e32 v4, v7, v14 +; GCN-NEXT: v_mul_hi_u32 v7, v9, v12 +; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4 +; GCN-NEXT: v_mul_hi_u32 v0, v10, v0 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v7 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GCN-NEXT: v_mul_hi_u32 v0, v2, v0 +; GCN-NEXT: v_mul_lo_u32 v10, v7, v5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 +; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 +; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] -; GCN-NEXT: v_sub_i32_e32 v9, vcc, v0, v5 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v7 +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[2:3] +; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v0 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GCN-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5] -; GCN-NEXT: v_sub_i32_e32 v11, vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[2:3] -; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v1 +; GCN-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[4:5] +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[2:3] +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v2, v6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v7 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v8, v15 ; GCN-NEXT: v_xor_b32_e32 v5, v0, v16 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v15 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v16 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v7 -; GCN-NEXT: v_mul_lo_u32 v5, v5, v4 -; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GCN-NEXT: v_mul_hi_u32 v5, v4, v5 -; GCN-NEXT: v_xor_b32_e32 v3, v3, v9 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[4:5] -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v10 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v9, v12 +; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GCN-NEXT: v_mul_hi_u32 v5, v12, v5 +; GCN-NEXT: v_xor_b32_e32 v3, v3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v10 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GCN-NEXT: v_mul_hi_u32 v5, v3, v5 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc ; GCN-NEXT: v_xor_b32_e32 v2, v2, v17 -; GCN-NEXT: v_mul_lo_u32 v5, v4, v7 +; GCN-NEXT: v_mul_lo_u32 v6, v5, v4 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 -; GCN-NEXT: v_xor_b32_e32 v6, v9, v14 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v7 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v7 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v3, v3, v6 +; GCN-NEXT: v_xor_b32_e32 v7, v8, v14 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v4 +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v6, vcc, v3, v4 +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; @@ -939,123 +935,122 @@ ; TONGA-NEXT: s_mov_b32 s5, s3 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; TONGA-NEXT: s_mov_b32 s2, 0x4f7ffffe ; TONGA-NEXT: s_mov_b32 s8, s0 ; TONGA-NEXT: s_mov_b32 s9, s1 ; TONGA-NEXT: s_waitcnt vmcnt(1) ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 ; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 -; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 -; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 -; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 -; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v5 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v8, v0 +; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 +; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 ; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v4 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v4 +; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5 +; TONGA-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 +; TONGA-NEXT: v_mul_lo_u32 v9, v9, v8 +; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 ; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13 -; TONGA-NEXT: v_mul_f32_e32 v9, s2, v9 ; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11 +; TONGA-NEXT: v_mul_hi_u32 v9, v8, v9 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 +; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v5 ; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v6 -; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v9, v8 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 +; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v10 +; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v11 +; TONGA-NEXT: v_mul_lo_u32 v11, v8, v4 +; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2 ; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12 -; TONGA-NEXT: v_mul_f32_e32 v8, s2, v8 ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v5 -; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v11 +; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 ; TONGA-NEXT: v_mul_lo_u32 v12, v12, v9 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 -; TONGA-NEXT: v_mul_lo_u32 v10, v10, v8 -; TONGA-NEXT: v_mul_hi_u32 v12, v9, v12 -; TONGA-NEXT: v_mul_f32_e32 v11, s2, v11 -; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11 -; TONGA-NEXT: v_mul_hi_u32 v10, v8, v10 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v12, v9 -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v6 -; TONGA-NEXT: v_mul_lo_u32 v12, v12, v11 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 -; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 -; TONGA-NEXT: v_mul_hi_u32 v12, v11, v12 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v0, v4 ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7 -; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14 -; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v7 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v12, v11 -; TONGA-NEXT: v_mul_lo_u32 v12, v8, v4 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9 -; TONGA-NEXT: v_mul_hi_u32 v11, v2, v11 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] -; TONGA-NEXT: v_mul_f32_e32 v10, s2, v10 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_mul_lo_u32 v0, v9, v5 -; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v10 -; TONGA-NEXT: v_mul_lo_u32 v10, v11, v6 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v9 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v6 +; TONGA-NEXT: v_mul_lo_u32 v0, v0, v10 +; TONGA-NEXT: v_xor_b32_e32 v4, v7, v14 +; TONGA-NEXT: v_mul_hi_u32 v7, v9, v12 +; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4 +; TONGA-NEXT: v_mul_hi_u32 v0, v10, v0 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v9 +; TONGA-NEXT: v_mul_hi_u32 v7, v1, v7 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v10 +; TONGA-NEXT: v_mul_hi_u32 v0, v2, v0 +; TONGA-NEXT: v_mul_lo_u32 v10, v7, v5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v4 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 +; TONGA-NEXT: v_mul_lo_u32 v10, v0, v6 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 +; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 ; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v11 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v0, v5 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v7 +; TONGA-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[2:3] +; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v0 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5] -; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[2:3] -; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v1 +; TONGA-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[4:5] +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[2:3] +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v2, v6 +; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] +; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v7 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc +; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; TONGA-NEXT: v_xor_b32_e32 v1, v8, v15 ; TONGA-NEXT: v_xor_b32_e32 v5, v0, v16 ; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v15, v1 ; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v16, v5 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v7 -; TONGA-NEXT: v_mul_lo_u32 v5, v5, v4 -; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v9, v3 -; TONGA-NEXT: v_mul_hi_u32 v5, v4, v5 -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v9 -; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[4:5] -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v10 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4 -; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 +; TONGA-NEXT: v_mul_lo_u32 v5, v9, v12 +; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; TONGA-NEXT: v_mul_hi_u32 v5, v12, v5 +; TONGA-NEXT: v_xor_b32_e32 v3, v3, v8 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v10 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v12 +; TONGA-NEXT: v_mul_hi_u32 v5, v3, v5 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17 -; TONGA-NEXT: v_mul_lo_u32 v5, v4, v7 +; TONGA-NEXT: v_mul_lo_u32 v6, v5, v4 ; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v17, v2 -; TONGA-NEXT: v_xor_b32_e32 v6, v9, v14 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v7 -; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v3, v7 -; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; TONGA-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6 -; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; TONGA-NEXT: v_xor_b32_e32 v7, v8, v14 +; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v3, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; TONGA-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 +; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v7, v3 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; @@ -1071,7 +1066,6 @@ ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX9-NEXT: s_mov_b32 s8, s0 ; GFX9-NEXT: s_mov_b32 s9, s1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1112,13 +1106,13 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; GFX9-NEXT: v_mul_f32_e32 v8, s2, v8 +; GFX9-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX9-NEXT: v_mul_f32_e32 v10, s2, v10 -; GFX9-NEXT: v_mul_f32_e32 v12, s2, v12 +; GFX9-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; GFX9-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GFX9-NEXT: v_sub_u32_e32 v9, 0, v4 -; GFX9-NEXT: v_mul_f32_e32 v14, s2, v14 +; GFX9-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GFX9-NEXT: v_mul_lo_u32 v9, v9, v8 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1794,7 +1794,7 @@ ; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mul_f32_e32 v1, s3, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v2, -v1, v0, s3 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ -1821,7 +1821,7 @@ ; GCN-IR-NEXT: s_or_b32 s0, s0, 1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s3, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ -1906,7 +1906,7 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -1926,7 +1926,7 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -1951,7 +1951,7 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -1971,7 +1971,7 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -507,9 +507,9 @@ ; Check that "pulling out" SDWA operands works correctly. ; GCN-LABEL: {{^}}pulled_out_test: -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_and_b32_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -639,7 +639,6 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 @@ -650,8 +649,8 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, v9, v7 ; SI-NEXT: v_lshlrev_b32_e32 v3, v8, v6 -; SI-NEXT: v_and_b32_e32 v1, s4, v1 -; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -490,7 +490,6 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b32 s4, 0xffff ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 @@ -501,8 +500,8 @@ ; CI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; CI-NEXT: v_lshlrev_b32_e32 v2, v9, v7 ; CI-NEXT: v_lshlrev_b32_e32 v3, v8, v6 -; CI-NEXT: v_and_b32_e32 v1, s4, v1 -; CI-NEXT: v_and_b32_e32 v0, s4, v0 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -555,7 +554,6 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s2, 0xff000000 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -564,8 +562,8 @@ ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_and_b32_e32 v4, s2, v4 -; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xff000000, v4 +; VI-NEXT: v_and_b32_e32 v0, 0xff000000, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -581,14 +579,13 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b32 s4, 0xff00 ; CI-NEXT: s_mov_b64 s[2:3], s[6:7] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; CI-NEXT: v_and_b32_e32 v4, s4, v4 +; CI-NEXT: v_and_b32_e32 v4, 0xff00, v4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; CI-NEXT: v_and_b32_e32 v3, s4, v3 +; CI-NEXT: v_and_b32_e32 v3, 0xff00, v3 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -240,7 +240,6 @@ ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s2, 0xffff ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -255,9 +254,9 @@ ; SI-NEXT: v_ashr_i32_e32 v0, v0, v6 ; SI-NEXT: v_ashr_i32_e32 v2, v4, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, s2, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll @@ -64,8 +64,7 @@ ; CHECK-NEXT: v_bfe_i32 v5, v0, 0, 31 ; CHECK-NEXT: s_mov_b32 s4, 0x38e38e39 ; CHECK-NEXT: s_mov_b32 s5, 0xc71c71c7 -; CHECK-NEXT: s_brev_b32 s6, -2 -; CHECK-NEXT: s_mov_b32 s7, 0x7ffffffd +; CHECK-NEXT: s_mov_b32 s6, 0x7ffffffd ; CHECK-NEXT: v_mul_hi_i32 v5, v5, s4 ; CHECK-NEXT: v_mul_hi_i32 v4, v4, s4 ; CHECK-NEXT: v_mul_hi_i32 v3, v3, s5 @@ -84,12 +83,12 @@ ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_and_b32_e32 v2, s6, v2 -; CHECK-NEXT: v_and_b32_e32 v1, s6, v1 -; CHECK-NEXT: v_and_b32_e32 v0, s6, v0 +; CHECK-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1964,7 +1964,7 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mul_f32_e32 v1, s6, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v2, -v1, v0, s6 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ -1991,7 +1991,7 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s6, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s6 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ -2084,7 +2084,7 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v0 ; GCN-NEXT: v_or_b32_e32 v3, 1, v3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v4, -v2, v1, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -2106,7 +2106,7 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 30, v0 ; GCN-IR-NEXT: v_or_b32_e32 v3, 1, v3 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v4, -v2, v1, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -2133,7 +2133,7 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v0 ; GCN-NEXT: v_or_b32_e32 v3, 1, v3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v4, -v2, v1, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -2155,7 +2155,7 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 30, v0 ; GCN-IR-NEXT: v_or_b32_e32 v3, 1, v3 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v4, -v2, v1, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -134,13 +134,11 @@ ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -156,15 +154,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -198,21 +195,18 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_max_i32_e32 v3, s5, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -225,22 +219,21 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 @@ -274,29 +267,26 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v2, s5, v2 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -309,15 +299,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -328,14 +317,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -369,15 +358,14 @@ ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -389,15 +377,14 @@ ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: s_brev_b32 s6, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -428,22 +415,21 @@ ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -455,22 +441,21 @@ ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: s_brev_b32 s6, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -503,29 +488,28 @@ ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -537,29 +521,28 @@ ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX8-NEXT: s_brev_b32 s6, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -594,57 +577,56 @@ ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v5, s6, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -656,57 +638,56 @@ ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX8-NEXT: s_brev_b32 s6, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v5, s6, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -748,37 +729,36 @@ ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 -; GFX6-NEXT: s_brev_b32 s6, 1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 @@ -786,70 +766,70 @@ ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v5, s6, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, v22 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v8, s6, v8 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 ; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v9, s6, v9 +; GFX6-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v10, s6, v10 +; GFX6-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 ; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v11, s6, v11 +; GFX6-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v12, v28 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v12, s6, v12 +; GFX6-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v13, v29 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 ; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v13, s6, v13 +; GFX6-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc ; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v14, v30 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v14, s6, v14 +; GFX6-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -857,7 +837,7 @@ ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v15, s6, v15 +; GFX6-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -868,37 +848,36 @@ ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 -; GFX8-NEXT: s_brev_b32 s6, 1 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 @@ -906,70 +885,70 @@ ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v5, s6, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v8, s6, v8 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v9, s6, v9 +; GFX8-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v10, s6, v10 +; GFX8-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v11, s6, v11 +; GFX8-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v12, v28 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v12, s6, v12 +; GFX8-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v13, v29 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v13, s6, v13 +; GFX8-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc ; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v14, v30 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v14, s6, v14 +; GFX8-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -977,7 +956,7 @@ ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v16 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v15, s6, v15 +; GFX8-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -94,10 +94,9 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX9-NEXT: v_fma_f16 v0, v0, v2, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_fma_f16 v7, v9, v8, v7 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -578,7 +578,7 @@ ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -8,11 +8,10 @@ ; GFX6-LABEL: v_uaddsat_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 0xff, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i8: @@ -46,11 +45,10 @@ ; GFX6-LABEL: v_uaddsat_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i16: @@ -110,15 +108,14 @@ ; GFX6-LABEL: v_uaddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -152,20 +149,19 @@ ; GFX6-LABEL: v_uaddsat_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_u32_e32 v3, s4, v2 +; GFX6-NEXT: v_min_u32_e32 v3, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -202,25 +198,24 @@ ; GFX6-LABEL: v_uaddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v7 -; GFX6-NEXT: v_min_u32_e32 v2, s4, v2 -; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_u32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -343,7 +343,6 @@ ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s2, 0x4f7ffffe ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -353,8 +352,8 @@ ; SI-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; SI-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; SI-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 -; SI-NEXT: v_mul_f32_e32 v4, s2, v4 -; SI-NEXT: v_mul_f32_e32 v5, s2, v5 +; SI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; SI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 ; SI-NEXT: v_cvt_u32_f32_e32 v4, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v5, v5 ; SI-NEXT: v_mul_lo_u32 v6, v6, v4 @@ -399,7 +398,6 @@ ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s2, 0x4f7ffffe ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -409,8 +407,8 @@ ; VI-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; VI-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; VI-NEXT: v_sub_u32_e32 v7, vcc, 0, v3 -; VI-NEXT: v_mul_f32_e32 v4, s2, v4 -; VI-NEXT: v_mul_f32_e32 v5, s2, v5 +; VI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; VI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 ; VI-NEXT: v_cvt_u32_f32_e32 v4, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v5 ; VI-NEXT: v_mul_lo_u32 v6, v6, v4 @@ -451,15 +449,14 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GCN-NEXT: v_mul_f32_e32 v4, s2, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GCN-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GCN-NEXT: v_mul_f32_e32 v5, s2, v5 +; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v5 ; GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 ; GCN-NEXT: v_mul_lo_u32 v5, v4, v6 @@ -612,7 +609,6 @@ ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; SI-NEXT: s_mov_b32 s2, 0x4f7ffffe ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -624,10 +620,10 @@ ; SI-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; SI-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; SI-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; SI-NEXT: v_mul_f32_e32 v8, s2, v8 -; SI-NEXT: v_mul_f32_e32 v10, s2, v10 -; SI-NEXT: v_mul_f32_e32 v12, s2, v12 -; SI-NEXT: v_mul_f32_e32 v14, s2, v14 +; SI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; SI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; SI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; SI-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 ; SI-NEXT: v_cvt_u32_f32_e32 v8, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v10, v10 ; SI-NEXT: v_cvt_u32_f32_e32 v12, v12 @@ -708,7 +704,6 @@ ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 -; VI-NEXT: s_mov_b32 s2, 0x4f7ffffe ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -720,10 +715,10 @@ ; VI-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; VI-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; VI-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; VI-NEXT: v_mul_f32_e32 v8, s2, v8 -; VI-NEXT: v_mul_f32_e32 v10, s2, v10 -; VI-NEXT: v_mul_f32_e32 v12, s2, v12 -; VI-NEXT: v_mul_f32_e32 v14, s2, v14 +; VI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; VI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; VI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; VI-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 ; VI-NEXT: v_cvt_u32_f32_e32 v8, v8 ; VI-NEXT: v_cvt_u32_f32_e32 v10, v10 ; VI-NEXT: v_cvt_u32_f32_e32 v12, v12 @@ -804,7 +799,6 @@ ; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: v_mov_b32_e32 v9, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) @@ -816,10 +810,10 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 ; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 -; GCN-NEXT: v_mul_f32_e32 v10, s2, v10 -; GCN-NEXT: v_mul_f32_e32 v12, s2, v12 -; GCN-NEXT: v_mul_f32_e32 v14, s2, v14 -; GCN-NEXT: v_mul_f32_e32 v16, s2, v16 +; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 +; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 ; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1700,7 +1700,7 @@ ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s4 @@ -1721,7 +1721,7 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s4 @@ -1795,7 +1795,7 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s4 @@ -1812,7 +1812,7 @@ ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s4 @@ -1834,7 +1834,7 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x47000000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s4 @@ -1851,7 +1851,7 @@ ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x47000000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -157,19 +157,18 @@ ; GFX6-LABEL: test_udivrem_v2: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX6-NEXT: s_sub_i32 s2, 0, s7 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 @@ -177,22 +176,22 @@ ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s7, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -202,18 +201,17 @@ ; GFX8-LABEL: test_udivrem_v2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX8-NEXT: s_sub_i32 s2, 0, s6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: s_sub_i32 s2, 0, s6 ; GFX8-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX8-NEXT: s_sub_i32 s2, 0, s7 ; GFX8-NEXT: v_mul_lo_u32 v3, s2, v1 @@ -231,11 +229,11 @@ ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -320,26 +318,25 @@ ; GFX6-LABEL: test_udivrem_v4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_i32 s2, 0, s8 -; GFX6-NEXT: s_sub_i32 s12, 0, s9 +; GFX6-NEXT: s_sub_i32 s12, 0, s8 +; GFX6-NEXT: s_sub_i32 s13, 0, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 -; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s13, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 @@ -347,7 +344,7 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -367,7 +364,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: s_sub_i32 s4, 0, s11 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, s13, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 @@ -399,7 +396,6 @@ ; GFX8-LABEL: test_udivrem_v4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s12, 0x4f7ffffe ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 @@ -410,9 +406,9 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s11 -; GFX8-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, s12, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_mul_lo_u32 v2, s2, v0 @@ -425,18 +421,18 @@ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX8-NEXT: v_mul_f32_e32 v2, s12, v3 +; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v0 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, s2, v2 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s9, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -444,7 +440,7 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX8-NEXT: s_sub_i32 s2, 0, s11 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, s12, v4 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s9, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s6, v2 diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll @@ -5,12 +5,11 @@ ; CHECK-LABEL: test_urem_odd: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1fff -; CHECK-NEXT: s_movk_i32 s5, 0x667 -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x1fff, v0 +; CHECK-NEXT: s_movk_i32 s4, 0x667 ; CHECK-NEXT: v_mul_u32_u24_e32 v0, 0xccd, v0 -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s5, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x1fff, v0 +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %urem = urem i13 %X, 5 @@ -56,10 +55,9 @@ ; CHECK-LABEL: test_urem_negative_odd: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1ff -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x1ff, v0 ; CHECK-NEXT: v_mul_u32_u24_e32 v0, 0x133, v0 -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x1ff, v0 ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -72,27 +70,26 @@ ; CHECK-LABEL: test_urem_vec: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x7ff -; CHECK-NEXT: s_mov_b32 s5, 0x8311eb33 -; CHECK-NEXT: s_mov_b32 s6, 0x20140c -; CHECK-NEXT: s_mov_b32 s7, 0xb6db6db7 -; CHECK-NEXT: s_mov_b32 s8, 0x24924924 -; CHECK-NEXT: s_mov_b32 s9, 0xaaaaaaab -; CHECK-NEXT: s_mov_b32 s10, 0x2aaaaaaa -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; CHECK-NEXT: v_and_b32_e32 v1, s4, v1 -; CHECK-NEXT: v_and_b32_e32 v2, s4, v2 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, s5 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, s7 -; CHECK-NEXT: v_mul_lo_u32 v0, v0, s9 +; CHECK-NEXT: v_and_b32_e32 v0, 0x7ff, v0 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7ff, v1 +; CHECK-NEXT: v_and_b32_e32 v2, 0x7ff, v2 +; CHECK-NEXT: s_mov_b32 s4, 0x8311eb33 +; CHECK-NEXT: s_mov_b32 s5, 0x20140c +; CHECK-NEXT: s_mov_b32 s6, 0xb6db6db7 +; CHECK-NEXT: s_mov_b32 s7, 0x24924924 +; CHECK-NEXT: s_mov_b32 s8, 0xaaaaaaab +; CHECK-NEXT: s_mov_b32 s9, 0x2aaaaaaa +; CHECK-NEXT: v_mul_lo_u32 v2, v2, s4 +; CHECK-NEXT: v_mul_lo_u32 v1, v1, s6 +; CHECK-NEXT: v_mul_lo_u32 v0, v0, s8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xf9dc299a, v2 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 0x49249249, v1 ; CHECK-NEXT: v_alignbit_b32 v0, v0, v0, 1 -; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s9, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s6, v2 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %urem = urem <3 x i11> %X, diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -486,36 +486,35 @@ ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-NEXT: s_lshr_b32 s5, s9, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GCN-NEXT: s_lshr_b32 s6, s11, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GCN-NEXT: s_lshr_b32 s7, s7, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, s7 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GCN-NEXT: v_mul_f32_e32 v4, v2, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GCN-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GCN-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: s_lshr_b32 s5, s5, 1 +; GCN-NEXT: s_lshr_b32 s6, s7, 1 +; GCN-NEXT: s_lshr_b32 s7, s11, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s7 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GCN-NEXT: v_mul_f32_e32 v2, v5, v6 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GCN-NEXT: v_mad_f32 v2, -v2, v3, v5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, s6 -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_and_b32_e32 v0, s4, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v4, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, s7 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, s4, v2 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; @@ -527,36 +526,35 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-IR-NEXT: s_lshr_b32 s5, s9, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GCN-IR-NEXT: s_lshr_b32 s6, s11, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GCN-IR-NEXT: s_lshr_b32 s7, s7, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v5, s7 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GCN-IR-NEXT: v_mul_f32_e32 v4, v2, v4 -; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GCN-IR-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: s_lshr_b32 s5, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s6, s7, 1 +; GCN-IR-NEXT: s_lshr_b32 s7, s11, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s7 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GCN-IR-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v5, v6 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s5 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GCN-IR-NEXT: v_mad_f32 v2, -v2, v3, v5 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s6 -; GCN-IR-NEXT: s_brev_b32 s4, -2 -; GCN-IR-NEXT: v_and_b32_e32 v0, s4, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GCN-IR-NEXT: v_mad_f32 v2, -v2, v4, v5 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s7 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 -; GCN-IR-NEXT: v_and_b32_e32 v2, s4, v2 ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, @@ -632,36 +630,35 @@ ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-NEXT: s_lshr_b32 s5, s9, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GCN-NEXT: s_lshr_b32 s6, s11, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GCN-NEXT: s_lshr_b32 s7, s7, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, s7 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GCN-NEXT: v_mul_f32_e32 v4, v2, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GCN-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GCN-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: s_lshr_b32 s5, s5, 1 +; GCN-NEXT: s_lshr_b32 s6, s7, 9 +; GCN-NEXT: s_lshr_b32 s7, s11, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s7 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GCN-NEXT: v_mul_f32_e32 v2, v5, v6 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GCN-NEXT: v_mad_f32 v2, -v2, v3, v5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, s6 -; GCN-NEXT: s_brev_b32 s4, -2 -; GCN-NEXT: v_and_b32_e32 v0, s4, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v4, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, s7 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 -; GCN-NEXT: v_and_b32_e32 v2, s4, v2 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; @@ -673,36 +670,35 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s4, s5, 1 -; GCN-IR-NEXT: s_lshr_b32 s5, s9, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GCN-IR-NEXT: s_lshr_b32 s6, s11, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GCN-IR-NEXT: s_lshr_b32 s7, s7, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v5, s7 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GCN-IR-NEXT: v_mul_f32_e32 v4, v2, v4 -; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GCN-IR-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: s_lshr_b32 s5, s5, 1 +; GCN-IR-NEXT: s_lshr_b32 s6, s7, 9 +; GCN-IR-NEXT: s_lshr_b32 s7, s11, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s7 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v5, s6 +; GCN-IR-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v5, v6 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s5 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GCN-IR-NEXT: v_mad_f32 v2, -v2, v3, v5 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s6 -; GCN-IR-NEXT: s_brev_b32 s4, -2 -; GCN-IR-NEXT: v_and_b32_e32 v0, s4, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GCN-IR-NEXT: v_mad_f32 v2, -v2, v4, v5 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s7 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 -; GCN-IR-NEXT: v_and_b32_e32 v2, s4, v2 ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, @@ -1375,7 +1371,7 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s5, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s5 @@ -1398,7 +1394,7 @@ ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s5, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s5 @@ -1480,7 +1476,7 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v2, -v2, v1, s4 @@ -1499,7 +1495,7 @@ ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v2, -v2, v1, s4 @@ -1523,7 +1519,7 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v2, -v2, v1, s4 @@ -1542,7 +1538,7 @@ ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v2, -v2, v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -8,9 +8,8 @@ ; GFX6-LABEL: v_usubsat_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -43,9 +42,8 @@ ; GFX6-LABEL: v_usubsat_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -214,11 +212,10 @@ ; GFX6-LABEL: v_usubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v4 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 @@ -256,16 +253,15 @@ ; GFX6-LABEL: v_usubsat_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v6, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v6 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -305,18 +301,17 @@ ; GFX6-LABEL: v_usubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v9, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v9 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v7 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -953,13 +953,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, v2, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_1100: @@ -1366,7 +1365,7 @@ ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1