diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -69,7 +69,7 @@ MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg()) { Register Reg = Src0.getReg(); - if (Reg.isVirtual() && MRI.hasOneUse(Reg)) { + if (Reg.isVirtual()) { MachineInstr *Def = MRI.getUniqueVRegDef(Reg); if (Def && Def->isMoveImmediate()) { MachineOperand &MovSrc = Def->getOperand(1); @@ -91,8 +91,8 @@ } if (ConstantFolded) { - assert(MRI.use_empty(Reg)); - Def->eraseFromParent(); + if (MRI.use_nodbg_empty(Reg)) + Def->eraseFromParent(); ++NumLiteralConstantsFolded; return true; } @@ -644,11 +644,7 @@ } } - // FIXME: We also need to consider movs of constant operands since - // immediate operands are not folded if they have more than one use, and - // the operand folding pass is unaware if the immediate will be free since - // it won't know if the src == dest constraint will end up being - // satisfied. + // Try to use S_ADDK_I32 and S_MULK_I32. if (MI.getOpcode() == AMDGPU::S_ADD_I32 || MI.getOpcode() == AMDGPU::S_MUL_I32) { const MachineOperand *Dest = &MI.getOperand(0); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -94,9 +94,8 @@ ; GFX8-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v1 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -559,12 +559,11 @@ ; GFX6-LABEL: v_andn2_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v0, v0, v1 @@ -816,18 +815,17 @@ ; GFX6-LABEL: v_andn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v8 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v8 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_and_b32_e32 v4, v6, v8 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -718,11 +718,10 @@ ; GFX6-LABEL: v_ashr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -841,15 +840,14 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: ashr_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: v_ashr_i32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: v_ashr_i32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -885,10 +883,9 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -931,25 +928,24 @@ ; GFX6-LABEL: v_ashr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1094,46 +1090,44 @@ ; GFX6-LABEL: v_ashr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v9 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v9 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v11 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v16 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, v2, v16 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, v8, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v5, v16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v4, v16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, v7, v16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, v6, v16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -892,18 +892,18 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_lshr_b32 s1, s2, 2 -; GFX7-NEXT: s_and_b32 s2, s2, 3 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: s_and_b32 s1, s2, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_lshl_b32 s0, s1, 3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 @@ -918,7 +918,6 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: s_lshl_b32 s0, s2, 3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog @@ -1019,7 +1018,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 @@ -1028,9 +1026,9 @@ ; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 @@ -2291,58 +2289,56 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v4, 0xff -; GFX7-NEXT: s_lshr_b32 s1, s2, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_and_b32 s2, s2, 3 +; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v9, s0, v0 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 +; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v15 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_or_b32_e32 v11, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7-NEXT: s_lshl_b32 s0, s2, 3 +; GFX7-NEXT: s_lshl_b32 s0, s1, 3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-NEXT: ; return to shader part epilog @@ -2500,57 +2496,55 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff -; GFX7-NEXT: v_mov_b32_e32 v0, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v3 +; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v14, v5, v0 +; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 ; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v6 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v16 -; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v3, v4, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 +; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -866,8 +866,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 @@ -881,7 +880,7 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 @@ -1001,8 +1000,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 @@ -1016,7 +1014,7 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 @@ -1182,8 +1180,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 @@ -1197,7 +1194,7 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -58,15 +58,14 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 4 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -108,14 +107,13 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s32 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, s32, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s32, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -237,14 +235,13 @@ ; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x104 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x104, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -292,16 +289,16 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v1, off, s32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, vcc_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, vcc_hi, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -408,14 +405,13 @@ ; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4004 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x4004, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -463,16 +459,16 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_add_u32_e32 v1, vcc_hi, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v3, off +; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, vcc_hi, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -255,7 +255,7 @@ ; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b32 s2, 0x80000000 -; SI-NEXT: v_sub_f32_e32 v2, s2, v2 +; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2 ; SI-NEXT: v_sub_f32_e64 v4, s2, |v4| ; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] @@ -290,7 +290,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_sub_f32_e32 v4, s2, v7 +; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7 ; VI-NEXT: v_sub_f32_e64 v3, s2, |v3| ; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -308,7 +308,7 @@ ; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s2, 0x80000000 -; GFX9-NEXT: v_sub_f32_e32 v1, s2, v1 +; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3| ; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -186,9 +186,8 @@ ; GFX8-LABEL: v_fmul_v4f16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 @@ -223,9 +222,8 @@ ; GFX8-LABEL: v_fmul_v4f16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 @@ -336,10 +334,9 @@ ; GFX8-LABEL: v_fmul_v6f16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 @@ -381,10 +378,9 @@ ; GFX8-LABEL: v_fmul_v6f16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 -; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, 0x80008000, v5 ; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 @@ -515,11 +511,10 @@ ; GFX8-LABEL: v_fmul_v8f16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 @@ -567,11 +562,10 @@ ; GFX8-LABEL: v_fmul_v8f16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 -; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, 0x80008000, v5 +; GFX8-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 +; GFX8-NEXT: v_xor_b32_e32 v7, 0x80008000, v7 ; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 ; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -398,19 +398,17 @@ ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_mov_b32 s4, 0x80008000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_log_f32_e32 v0, v0 @@ -427,11 +425,10 @@ ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX8-NEXT: v_log_f16_e32 v2, v0 ; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -450,11 +447,10 @@ ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0x80008000 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX9-NEXT: v_log_f16_e32 v2, v0 ; GFX9-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -10,7 +10,6 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s2, s2, 0x7f -; GFX6-NEXT: s_movk_i32 s3, 0x7f ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -27,8 +26,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 6, v0 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshr_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -41,10 +40,9 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f ; GFX8-NEXT: s_and_b32 s1, s1, 0x7f -; GFX8-NEXT: s_movk_i32 s3, 0x7f +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 ; GFX8-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -59,8 +57,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u16_e32 v1, 6, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -73,10 +71,9 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f ; GFX9-NEXT: s_and_b32 s1, s1, 0x7f -; GFX9-NEXT: s_movk_i32 s3, 0x7f +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -91,8 +88,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u16_e32 v1, 6, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -147,7 +144,6 @@ ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7f ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 @@ -157,9 +153,9 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -170,15 +166,14 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 @@ -188,9 +183,9 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -201,15 +196,14 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 @@ -219,9 +213,9 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v3 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -658,9 +652,8 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -996,23 +989,22 @@ ; GFX6-NEXT: v_bfe_u32 v4, v1, 16, 8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1059,12 +1051,11 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1114,8 +1105,8 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1189,10 +1180,9 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX6-NEXT: s_mov_b32 s3, 0xffffff +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -1206,8 +1196,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_lshr_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1220,10 +1210,9 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX8-NEXT: s_mov_b32 s3, 0xffffff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -1237,8 +1226,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1251,10 +1240,9 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX9-NEXT: s_mov_b32 s3, 0xffffff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -1268,8 +1256,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s1 ; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -1321,7 +1309,6 @@ ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 @@ -1331,9 +1318,9 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1352,7 +1339,6 @@ ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 @@ -1362,9 +1348,9 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1383,7 +1369,6 @@ ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 @@ -1393,8 +1378,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1438,11 +1423,11 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 -; GFX6-NEXT: s_and_b32 s10, s0, 0xff +; GFX6-NEXT: s_and_b32 s9, s0, 0xff ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: s_or_b32 s0, s10, s0 +; GFX6-NEXT: s_or_b32 s0, s9, s0 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 @@ -1461,12 +1446,12 @@ ; GFX6-NEXT: s_or_b32 s1, s1, s6 ; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_lshr_b32 s7, s2, 24 -; GFX6-NEXT: s_and_b32 s10, s2, 0xff +; GFX6-NEXT: s_and_b32 s9, s2, 0xff ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: s_or_b32 s2, s10, s2 +; GFX6-NEXT: s_or_b32 s2, s9, s2 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_lshr_b32 s8, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 @@ -1483,13 +1468,13 @@ ; GFX6-NEXT: s_or_b32 s3, s3, s6 ; GFX6-NEXT: s_lshr_b32 s6, s4, 16 ; GFX6-NEXT: s_lshr_b32 s7, s4, 24 -; GFX6-NEXT: s_and_b32 s10, s4, 0xff +; GFX6-NEXT: s_and_b32 s9, s4, 0xff ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX6-NEXT: s_or_b32 s4, s10, s4 +; GFX6-NEXT: s_or_b32 s4, s9, s4 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 @@ -1519,13 +1504,12 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: s_mov_b32 s6, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_lshr_b32 s0, s2, 1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1535,23 +1519,21 @@ ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: s_lshr_b32 s0, s3, 1 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: s_movk_i32 s9, 0xff +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s9, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s9, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_bfe_u32 v2, v1, 8, 8 @@ -1653,13 +1635,12 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX8-NEXT: s_mov_b32 s6, 0xffffff ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX8-NEXT: v_and_b32_e32 v0, s6, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX8-NEXT: s_lshr_b32 s0, s2, 1 -; GFX8-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1669,11 +1650,10 @@ ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s1 ; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -1784,32 +1764,30 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: s_mov_b32 s7, 0xffffff -; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0 +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: v_and_b32_e32 v3, s7, v3 -; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 -; GFX9-NEXT: v_lshrrev_b32_e64 v3, v3, s2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffff -; GFX9-NEXT: v_sub_u32_e32 v3, 23, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 ; GFX9-NEXT: s_lshr_b32 s0, s3, 1 -; GFX9-NEXT: v_and_b32_e32 v2, v3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: s_mov_b32 s6, 8 ; GFX9-NEXT: v_lshl_or_b32 v1, s1, v1, v2 -; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_mov_b32 s8, 16 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -1964,14 +1942,13 @@ ; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX6-NEXT: v_mul_lo_u32 v8, v7, v6 ; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v9 ; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX6-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX6-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; GFX6-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 @@ -1987,9 +1964,9 @@ ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, v6, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v7, 24 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2001,10 +1978,10 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 23 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v9 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2019,14 +1996,13 @@ ; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX8-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX8-NEXT: v_mul_lo_u32 v8, v7, v6 ; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v8, v9 ; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX8-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 @@ -2042,9 +2018,9 @@ ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 ; GFX8-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 23, v4 -; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v4, v6, v9 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v6 ; GFX8-NEXT: v_mul_lo_u32 v6, v7, 24 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 @@ -2056,10 +2032,10 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_bfe_u32 v2, v3, 1, 23 -; GFX8-NEXT: v_and_b32_e32 v3, v4, v9 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2076,15 +2052,14 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 -; GFX9-NEXT: v_bfe_u32 v2, v2, 1, 23 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX9-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX9-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX9-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v9 ; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX9-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_mul_hi_u32 v7, v8, v7 @@ -2098,8 +2073,8 @@ ; GFX9-NEXT: v_add_u32_e32 v6, v8, v7 ; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6 ; GFX9-NEXT: v_sub_u32_e32 v7, 23, v4 -; GFX9-NEXT: v_and_b32_e32 v7, v7, v9 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v7, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v4, v2 @@ -2111,8 +2086,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3308,10 +3283,9 @@ ; GFX9-LABEL: v_fshl_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v2 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 @@ -3445,15 +3419,14 @@ ; ; GFX9-LABEL: v_fshl_v2i16_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v1, s2, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xf000f, v0 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s1, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: s_lshr_b32 s1, s1, 0x10001 ; GFX9-NEXT: s_lshr_b32 s0, s0, 1 -; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xf000f, v0 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 @@ -3939,17 +3912,16 @@ ; GFX9-LABEL: v_fshl_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v6, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xf000f, v4 ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v6, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v5 ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 -; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 @@ -4703,10 +4675,9 @@ ; GFX6-LABEL: v_fshl_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0x7f -; GFX6-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX6-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v14 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v14 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 @@ -4752,10 +4723,9 @@ ; GFX8-LABEL: v_fshl_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX8-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v14 ; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v14 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] @@ -4801,10 +4771,9 @@ ; GFX9-LABEL: v_fshl_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX9-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v14 ; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v14 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] @@ -4902,10 +4871,9 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s8, 0x7f -; GFX6-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v6 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v6 @@ -4955,10 +4923,9 @@ ; ; GFX8-LABEL: v_fshl_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s8, 0x7f -; GFX8-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v6 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] @@ -5008,10 +4975,9 @@ ; ; GFX9-LABEL: v_fshl_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s8, 0x7f -; GFX9-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v6 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] @@ -6020,8 +5986,7 @@ ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s6, 0x7f -; GFX6-NEXT: v_and_b32_e32 v23, s6, v16 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 @@ -6029,7 +5994,7 @@ ; GFX6-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX6-NEXT: v_lshlrev_b32_e32 v17, 31, v10 -; GFX6-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v17 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 @@ -6064,9 +6029,9 @@ ; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v17, v8 ; GFX6-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX6-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v20 -; GFX6-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v16 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 @@ -6112,8 +6077,7 @@ ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s6, 0x7f -; GFX8-NEXT: v_and_b32_e32 v23, s6, v16 +; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] @@ -6121,7 +6085,7 @@ ; GFX8-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 31, v10 -; GFX8-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX8-NEXT: v_or_b32_e32 v9, v9, v17 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 @@ -6156,9 +6120,9 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v17, v8 ; GFX8-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX8-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX8-NEXT: v_xor_b32_e32 v8, -1, v20 -; GFX8-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v16 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] @@ -6204,8 +6168,7 @@ ; GFX9-LABEL: v_fshl_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s6, 0x7f -; GFX9-NEXT: v_and_b32_e32 v23, s6, v16 +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] @@ -6213,7 +6176,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 ; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 31, v10 -; GFX9-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX9-NEXT: v_or_b32_e32 v9, v9, v17 ; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 @@ -6248,9 +6211,9 @@ ; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 ; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX9-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v20 -; GFX9-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -10,11 +10,10 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s2, s2, 0x7f -; GFX6-NEXT: s_movk_i32 s3, 0x7f ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_and_b32 s1, s1, 0x7f ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s1, s1, 0x7f ; GFX6-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -28,8 +27,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 6, v0 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshr_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 @@ -41,11 +40,10 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f -; GFX8-NEXT: s_movk_i32 s3, 0x7f ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -59,8 +57,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u16_e32 v1, 6, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -72,11 +70,10 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f -; GFX9-NEXT: s_movk_i32 s3, 0x7f ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -90,8 +87,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u16_e32 v1, 6, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0 ; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s1 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 @@ -139,14 +136,13 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 @@ -156,8 +152,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -170,14 +166,13 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 @@ -187,8 +182,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -201,14 +196,13 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, -7, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 @@ -218,8 +212,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -646,10 +640,9 @@ ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 @@ -660,7 +653,7 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -980,43 +973,42 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_and_b32_e32 v11, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, v10, v11 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX6-NEXT: v_and_b32_e32 v10, 7, v7 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v10, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v7 ; GFX6-NEXT: v_xor_b32_e32 v7, -1, v7 ; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v7, v3 ; GFX6-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX6-NEXT: v_lshrrev_b32_e32 v7, v10, v7 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX6-NEXT: v_and_b32_e32 v7, 7, v8 -; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v7 +; GFX6-NEXT: v_xor_b32_e32 v7, -1, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX6-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 7, v8 +; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff -; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, v7, v1 -; GFX6-NEXT: v_xor_b32_e32 v7, -1, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 7, v9 -; GFX6-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v9 +; GFX6-NEXT: v_and_b32_e32 v3, 7, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, v7, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v6 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v3, v6 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, v4, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1061,12 +1053,11 @@ ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1115,8 +1106,8 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1190,10 +1181,9 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX6-NEXT: s_mov_b32 s3, 0xffffff +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -1208,8 +1198,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshr_b32_e32 v0, s1, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1222,10 +1212,9 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX8-NEXT: s_mov_b32 s3, 0xffffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -1240,8 +1229,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 @@ -1254,10 +1243,9 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff -; GFX9-NEXT: s_mov_b32 s3, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -1272,8 +1260,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s1 ; GFX9-NEXT: v_lshl_or_b32 v0, s0, v1, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 @@ -1322,12 +1310,11 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 @@ -1337,8 +1324,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1354,12 +1341,11 @@ ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 @@ -1369,8 +1355,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1383,15 +1369,14 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffffe8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 ; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 @@ -1401,8 +1386,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1450,46 +1435,46 @@ ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s10, s0, 0xff +; GFX6-NEXT: s_and_b32 s9, s0, 0xff ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX6-NEXT: s_or_b32 s0, s10, s0 +; GFX6-NEXT: s_or_b32 s0, s9, s0 ; GFX6-NEXT: s_or_b32 s1, s7, s1 ; GFX6-NEXT: s_and_b32 s7, s8, 0xff ; GFX6-NEXT: s_lshr_b32 s8, s2, 16 -; GFX6-NEXT: s_lshr_b32 s10, s2, 24 -; GFX6-NEXT: s_and_b32 s12, s2, 0xff +; GFX6-NEXT: s_lshr_b32 s9, s2, 24 +; GFX6-NEXT: s_and_b32 s11, s2, 0xff ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_and_b32 s8, s8, 0xff -; GFX6-NEXT: s_or_b32 s2, s12, s2 +; GFX6-NEXT: s_or_b32 s2, s11, s2 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshr_b32 s11, s3, 8 +; GFX6-NEXT: s_lshr_b32 s10, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 -; GFX6-NEXT: s_and_b32 s8, s11, 0xff +; GFX6-NEXT: s_and_b32 s8, s10, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: s_or_b32 s3, s10, s3 +; GFX6-NEXT: s_or_b32 s3, s9, s3 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s8 ; GFX6-NEXT: s_lshr_b32 s8, s4, 16 -; GFX6-NEXT: s_lshr_b32 s10, s4, 24 -; GFX6-NEXT: s_and_b32 s12, s4, 0xff +; GFX6-NEXT: s_lshr_b32 s9, s4, 24 +; GFX6-NEXT: s_and_b32 s11, s4, 0xff ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 ; GFX6-NEXT: s_and_b32 s8, s8, 0xff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX6-NEXT: s_or_b32 s4, s12, s4 +; GFX6-NEXT: s_or_b32 s4, s11, s4 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 @@ -1498,7 +1483,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_lshr_b32 s11, s5, 8 +; GFX6-NEXT: s_lshr_b32 s10, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: s_and_b32 s5, s5, 0xff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 @@ -1507,9 +1492,9 @@ ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX6-NEXT: s_and_b32 s8, s11, 0xff +; GFX6-NEXT: s_and_b32 s8, s10, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: s_or_b32 s5, s10, s5 +; GFX6-NEXT: s_or_b32 s5, s9, s5 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 @@ -1523,13 +1508,12 @@ ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX6-NEXT: s_mov_b32 s8, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0 ; GFX6-NEXT: s_lshl_b32 s4, s6, 17 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_or_b32 s0, s4, s0 -; GFX6-NEXT: v_and_b32_e32 v2, s8, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 @@ -1542,25 +1526,23 @@ ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 ; GFX6-NEXT: s_lshl_b32 s0, s7, 17 ; GFX6-NEXT: s_lshl_b32 s1, s1, 1 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX6-NEXT: s_movk_i32 s9, 0xff +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s9, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX6-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s9, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_bfe_u32 v2, v1, 8, 8 @@ -1656,14 +1638,13 @@ ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX8-NEXT: s_mov_b32 s8, 0xffffff -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX8-NEXT: s_lshl_b32 s4, s6, 17 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_or_b32 s0, s4, s0 -; GFX8-NEXT: v_and_b32_e32 v2, s8, v3 -; GFX8-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s2 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 @@ -1676,13 +1657,12 @@ ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 ; GFX8-NEXT: s_lshl_b32 s0, s7, 17 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s3 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 @@ -1786,39 +1766,37 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX9-NEXT: s_mov_b32 s10, 0xffffff ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0 ; GFX9-NEXT: s_lshl_b32 s4, s7, 17 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: v_and_b32_e32 v0, s10, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX9-NEXT: s_or_b32 s0, s4, s0 -; GFX9-NEXT: v_and_b32_e32 v3, s10, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s2 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, v3, v0 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, v2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffff -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_sub_u32_e32 v3, 23, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 ; GFX9-NEXT: s_lshl_b32 s0, s9, 17 ; GFX9-NEXT: s_lshl_b32 s1, s1, 1 -; GFX9-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s3 ; GFX9-NEXT: s_mov_b32 s6, 8 -; GFX9-NEXT: v_lshl_or_b32 v1, s0, v3, v1 -; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: v_lshl_or_b32 v1, s0, v2, v1 ; GFX9-NEXT: s_mov_b32 s8, 16 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -1973,19 +1951,19 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 ; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v9 -; GFX6-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX6-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; GFX6-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 @@ -1995,15 +1973,14 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v7, v8 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 -; GFX6-NEXT: v_and_b32_e32 v7, v7, v9 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX6-NEXT: v_mul_hi_u32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v3, v9 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 @@ -2013,8 +1990,8 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2030,19 +2007,19 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v8, v9 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX8-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 @@ -2052,15 +2029,14 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-NEXT: v_mul_lo_u32 v6, v7, v8 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 -; GFX8-NEXT: v_and_b32_e32 v7, v7, v9 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX8-NEXT: v_mul_hi_u32 v6, v8, v6 -; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v7, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_and_b32_e32 v3, v3, v9 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 @@ -2070,8 +2046,8 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 -; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v2, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2089,19 +2065,18 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX9-NEXT: v_mul_lo_u32 v8, v7, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v9 ; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX9-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v9 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v9 ; GFX9-NEXT: v_mul_hi_u32 v7, v8, v7 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 ; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 @@ -2114,8 +2089,8 @@ ; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX9-NEXT: v_and_b32_e32 v6, v6, v9 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v6, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v5, v7 @@ -2126,8 +2101,8 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v9 -; GFX9-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v2, v3 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, v4, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3215,10 +3190,9 @@ ; GFX9-LABEL: v_fshr_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v2 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1 @@ -3381,13 +3355,12 @@ ; ; GFX9-LABEL: v_fshr_v2i16_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v1, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: v_and_b32_e32 v1, 0xf000f, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 ; GFX9-NEXT: s_lshl_b32 s2, s2, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xf000f, v0 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s1 @@ -3894,12 +3867,11 @@ ; GFX6-LABEL: v_fshr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX6-NEXT: v_and_b32_e32 v8, v8, v12 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_or_b32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; GFX6-NEXT: v_and_b32_e32 v10, v10, v12 +; GFX6-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 ; GFX6-NEXT: v_bfe_u32 v10, v4, 1, 15 @@ -4029,17 +4001,16 @@ ; GFX9-LABEL: v_fshr_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v6, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v6, 0xf000f, v4 ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v4, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v6, v2 ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v5 -; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v4, v1 ; GFX9-NEXT: v_pk_lshrrev_b16 v2, v2, v3 @@ -4820,11 +4791,10 @@ ; GFX6-LABEL: v_fshr_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0x7f -; GFX6-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 @@ -4869,11 +4839,10 @@ ; GFX8-LABEL: v_fshr_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7f -; GFX8-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 @@ -4918,11 +4887,10 @@ ; GFX9-LABEL: v_fshr_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7f -; GFX9-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 @@ -5019,11 +4987,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshr_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_movk_i32 s8, 0x7f -; GFX6-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_mov_b32 s9, 0 -; GFX6-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s8, s1, 31 ; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 @@ -5072,11 +5039,10 @@ ; ; GFX8-LABEL: v_fshr_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s8, 0x7f -; GFX8-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_mov_b32 s9, 0 -; GFX8-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s8, s1, 31 ; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 @@ -5125,11 +5091,10 @@ ; ; GFX9-LABEL: v_fshr_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s8, 0x7f -; GFX9-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: s_mov_b32 s9, 0 -; GFX9-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s8, s1, 31 ; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 @@ -6143,17 +6108,16 @@ ; GFX6-LABEL: v_fshr_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s6, 0x7f ; GFX6-NEXT: v_xor_b32_e32 v17, -1, v16 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v23, s6, v17 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v17 ; GFX6-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 -; GFX6-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v24 ; GFX6-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX6-NEXT: v_or_b32_e32 v22, v18, v22 @@ -6189,7 +6153,7 @@ ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX6-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX6-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX6-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 @@ -6202,7 +6166,7 @@ ; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v17 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v18 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX6-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX6-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc @@ -6235,17 +6199,16 @@ ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s6, 0x7f ; GFX8-NEXT: v_xor_b32_e32 v17, -1, v16 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v23, s6, v17 +; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v17 ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v24 ; GFX8-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX8-NEXT: v_or_b32_e32 v22, v18, v22 @@ -6281,7 +6244,7 @@ ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX8-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 @@ -6294,7 +6257,7 @@ ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v17, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[8:9], v18, v[8:9] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX8-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX8-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc @@ -6327,17 +6290,16 @@ ; GFX9-LABEL: v_fshr_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s6, 0x7f ; GFX9-NEXT: v_xor_b32_e32 v17, -1, v16 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v23, s6, v17 +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v2, v2, v17 ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v24, s6, v16 +; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 ; GFX9-NEXT: v_sub_u32_e32 v16, 64, v24 ; GFX9-NEXT: v_or_b32_e32 v21, v17, v21 ; GFX9-NEXT: v_or_b32_e32 v22, v18, v22 @@ -6373,7 +6335,7 @@ ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 ; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 -; GFX9-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX9-NEXT: v_and_b32_e32 v17, 0x7f, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 @@ -6386,7 +6348,7 @@ ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v17, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], v18, v[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 -; GFX9-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX9-NEXT: v_and_b32_e32 v16, 0x7f, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/hip.extern.shared.array.ll @@ -82,9 +82,8 @@ ; The offset to the dynamic shared memory array should be aligned on the ; maximal one. ; CHECK-LABEL: {{^}}dynamic_shared_array_4: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}} -; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]] +; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x48, [[IDX]] define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() %vidx = add i32 %tid.x, %idx @@ -101,9 +100,8 @@ ; Honor the explicit alignment from the specified variable. ; CHECK-LABEL: {{^}}dynamic_shared_array_5: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}} -; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]] +; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x44, [[IDX]] define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() %vidx = add i32 %tid.x, %idx @@ -120,9 +118,8 @@ ; Honor the explicit alignment from the specified variable. ; CHECK-LABEL: {{^}}dynamic_shared_array_6: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}} -; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]] +; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x50, [[IDX]] define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() %vidx = add i32 %tid.x, %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -332,11 +332,10 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v1 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 @@ -547,12 +546,11 @@ ; GFX7-LABEL: insertelement_v_v2i16_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 @@ -1013,15 +1011,14 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 @@ -1306,13 +1303,12 @@ ; GFX7-LABEL: insertelement_v_v4i16_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 @@ -2001,20 +1997,19 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: s_mov_b32 s8, 0xffff +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 @@ -2380,15 +2375,14 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 @@ -3486,17 +3480,16 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX7-NEXT: s_mov_b32 s20, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v9, s18 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s20, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mov_b32_e32 v10, s19 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s20, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 @@ -4068,19 +4061,18 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -51,17 +51,16 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -133,17 +132,16 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -217,15 +215,14 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -301,17 +298,16 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -386,15 +382,14 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xff ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -466,17 +461,16 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -546,15 +540,14 @@ ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -624,15 +617,14 @@ ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -868,10 +860,10 @@ ; GFX9-LABEL: insertelement_s_v4i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_movk_i32 s5, 0xff -; GFX9-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s1, 8 ; GFX9-NEXT: s_mov_b32 s2, 16 +; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 @@ -938,8 +930,7 @@ ; GFX7-LABEL: insertelement_s_v4i8_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 @@ -959,7 +950,7 @@ ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1088,7 +1079,6 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 @@ -1103,13 +1093,13 @@ ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_and_b32 s1, s4, 0xff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1236,10 +1226,10 @@ ; GFX7-LABEL: insertelement_s_v4i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 @@ -1250,15 +1240,14 @@ ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s2, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 24 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1387,37 +1376,36 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: v_and_b32_e32 v1, 3, v2 ; GFX7-NEXT: s_and_b32 s0, s2, 0xff -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v1 -; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -1533,18 +1521,17 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_and_b32 s1, s2, 3 -; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, s1, v1 -; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 +; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v4, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -1552,7 +1539,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 @@ -1678,38 +1665,36 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_movk_i32 s2, 0xff -; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_mov_b32_e32 v1, 0xff -; GFX7-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v1 -; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v5, s2, v0 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v1, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2131,64 +2116,62 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_lshr_b32 s1, s3, 2 -; GFX7-NEXT: s_and_b32 s3, s3, 3 +; GFX7-NEXT: s_and_b32 s1, s3, 3 +; GFX7-NEXT: s_lshr_b32 s0, s3, 2 ; GFX7-NEXT: s_and_b32 s2, s2, 0xff -; GFX7-NEXT: s_lshl_b32 s3, s3, 3 -; GFX7-NEXT: s_lshl_b32 s2, s2, s3 -; GFX7-NEXT: s_lshl_b32 s3, 0xff, s3 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_not_b32 s3, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, s0, v0 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v7, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s3, v3 -; GFX7-NEXT: v_or_b32_e32 v3, s2, v3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -2373,7 +2356,6 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 @@ -2403,32 +2385,32 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4 -; GFX7-NEXT: v_or_b32_e32 v3, s3, v0 +; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_bfe_u32 v3, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2631,36 +2613,35 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_and_b32 s6, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_and_b32 s5, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s6, s6, s7 +; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s6, s0 +; GFX7-NEXT: s_or_b32 s0, s5, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s6, s1, 0x80008 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s1, 0xff -; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_and_b32 s2, s4, 0xff ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 @@ -2670,7 +2651,7 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -2887,37 +2868,36 @@ ; GFX7-LABEL: insertelement_s_v8i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_and_b32 s5, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_and_b32 s4, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s5, s5, s6 +; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s5, s0 +; GFX7-NEXT: s_or_b32 s0, s4, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX7-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_and_b32 s2, s1, 0xff -; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_lshl_b32 s4, s4, 8 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX7-NEXT: s_or_b32 s2, s2, s5 +; GFX7-NEXT: s_or_b32 s2, s2, s4 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 @@ -2928,7 +2908,7 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -3131,64 +3111,62 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, 0xff -; GFX7-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v5, s1, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v0 +; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v10, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3360,64 +3338,62 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v3, 0xff -; GFX7-NEXT: s_lshr_b32 s1, s2, 2 -; GFX7-NEXT: s_and_b32 s2, s2, 3 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX7-NEXT: s_lshl_b32 s2, s2, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 -; GFX7-NEXT: s_lshl_b32 s2, 0xff, s2 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_not_b32 s2, s2 +; GFX7-NEXT: s_and_b32 s1, s2, 3 +; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v6, s0, v0 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v7, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, v0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3588,62 +3564,60 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v4, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v0 +; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v10, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX7-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, v6, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v4, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -4349,109 +4323,107 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v4, 0xff -; GFX7-NEXT: s_and_b32 s1, s3, 3 +; GFX7-NEXT: s_and_b32 s0, s3, 3 ; GFX7-NEXT: s_lshr_b32 s4, s3, 2 -; GFX7-NEXT: s_and_b32 s2, s2, 0xff -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: s_lshl_b32 s5, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s6, s1 +; GFX7-NEXT: s_not_b32 s6, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v9, s0, v0 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 +; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v16, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v15, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v3, v12, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX7-NEXT: v_or_b32_e32 v3, v3, v8 -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 -; GFX7-NEXT: v_or_b32_e32 v5, s5, v5 +; GFX7-NEXT: v_or_b32_e32 v11, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v11, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v1 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -4757,7 +4729,6 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s0, 24 @@ -4811,58 +4782,58 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX7-NEXT: s_andn2_b32 s4, s6, s4 -; GFX7-NEXT: v_or_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_and_b32_e32 v5, v1, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_and_b32_e32 v5, v2, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -5256,56 +5227,55 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, 0xff -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v1, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v2, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -5638,26 +5608,26 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1 -; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s4, s0, 24 -; GFX7-NEXT: s_and_b32 s9, s0, 0xff -; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_and_b32 s8, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s9, s9, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX7-NEXT: s_or_b32 s9, s9, s10 +; GFX7-NEXT: s_or_b32 s8, s8, s9 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s9, s0 +; GFX7-NEXT: s_or_b32 s0, s8, s0 ; GFX7-NEXT: s_lshl_b32 s4, s4, 24 -; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 +; GFX7-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s1, 24 ; GFX7-NEXT: s_or_b32 s4, s0, s4 ; GFX7-NEXT: s_and_b32 s0, s1, 0xff -; GFX7-NEXT: s_lshl_b32 s9, s9, 8 +; GFX7-NEXT: s_lshl_b32 s8, s8, 8 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX7-NEXT: s_or_b32 s0, s0, s9 +; GFX7-NEXT: s_or_b32 s0, s0, s8 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s5, 24 @@ -5687,66 +5657,64 @@ ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_mov_b32_e32 v6, s7 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v6, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v5, 0xff -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v1, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v2, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, v3, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -6038,109 +6006,107 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v7, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v18, 2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX7-NEXT: v_lshl_b32_e32 v18, s0, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v17 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v17 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 +; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v3 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v12, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v14, v5, v7 +; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v17, v6, 8, 8 +; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v16, v6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v6 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 -; GFX7-NEXT: s_and_b32 s0, s2, 0xff -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 +; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshl_b32_e32 v19, s0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v18 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v18 -; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v19 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v18 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v7 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v11, v1, v7 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v13, v3, v7 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -6407,109 +6373,107 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v7, 0xff +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v2 +; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: s_lshr_b32 s4, s2, 2 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v7 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v10, s0, v3 +; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v4 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v12, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v14, v5, v7 +; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v17, v6, 8, 8 -; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v16, v6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v6 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: s_not_b32 s5, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 +; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 +; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc +; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v5, s5, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v7 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v11, v1, v7 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v2, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v13, v3, v7 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -6775,103 +6739,101 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_mov_b32_e32 v8, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v19, 2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 2, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v18 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v8 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v19 +; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v18 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 -; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v4, 8, 8 +; GFX7-NEXT: v_bfe_u32 v13, v5, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v11, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v13, s0, v5 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v5 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v15, v6, v8 +; GFX7-NEXT: v_bfe_u32 v15, v6, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v6 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 -; GFX7-NEXT: v_bfe_u32 v18, v7, 8, 8 +; GFX7-NEXT: v_bfe_u32 v17, v7, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX7-NEXT: v_or_b32_e32 v12, v13, v14 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v7 -; GFX7-NEXT: v_and_b32_e32 v17, v7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v7 +; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v7 ; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX7-NEXT: v_or_b32_e32 v13, v15, v16 -; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 -; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_or_b32_e32 v14, v17, v18 -; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX7-NEXT: v_or_b32_e32 v7, v14, v7 -; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v7, v13, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v10 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, v2, s[0:1] +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v5, v2, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v9, v0, v8 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v11, v1, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v13, v3, v8 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -585,10 +585,9 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm d16 -; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0xffff ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -46,14 +46,14 @@ ; GFX906-NEXT: s_movk_i32 s4, 0xff ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX906-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX906-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX906-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX906-NEXT: v_and_b32_e32 v2, s4, v6 -; GFX906-NEXT: v_and_b32_e32 v3, s4, v7 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v7 ; GFX906-NEXT: v_and_or_b32 v1, v4, s4, v1 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -46,14 +46,14 @@ ; GFX906-NEXT: s_movk_i32 s4, 0xff ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX906-NEXT: v_and_or_b32 v0, v0, s4, v1 -; GFX906-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX906-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX906-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX906-NEXT: v_and_b32_e32 v2, s4, v6 -; GFX906-NEXT: v_and_b32_e32 v3, s4, v7 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v7 ; GFX906-NEXT: v_and_or_b32 v1, v4, s4, v1 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -8,9 +8,8 @@ ; GFX6-LABEL: v_lshr_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -105,9 +104,8 @@ ; GCN-LABEL: v_lshr_i24: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0xffffff -; GCN-NEXT: v_and_b32_e32 v1, s4, v1 -; GCN-NEXT: v_and_b32_e32 v0, s4, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -550,9 +548,8 @@ ; GFX6-LABEL: v_lshr_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -681,12 +678,11 @@ ; GFX6-LABEL: v_lshr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -797,11 +793,10 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: lshr_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -834,12 +829,11 @@ define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: lshr_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -883,18 +877,17 @@ ; GFX6-LABEL: v_lshr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1032,31 +1025,29 @@ ; GFX6-LABEL: v_lshr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v9 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 -; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 -; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v8, v15, v16 -; GFX6-NEXT: v_and_b32_e32 v7, v7, v16 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, v8, v7 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -38,9 +38,8 @@ ; GFX7-LABEL: v_mul_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -169,9 +168,8 @@ ; GFX7-LABEL: v_mul_i16_signext: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll @@ -94,9 +94,8 @@ ; GFX8-LABEL: v_mul_v2i16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x80008000 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX8-NEXT: v_mul_lo_u16_e32 v2, v0, v1 ; GFX8-NEXT: v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -559,12 +559,11 @@ ; GFX6-LABEL: v_orn2_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -816,18 +815,17 @@ ; GFX6-LABEL: v_orn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v8 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v3, v4, v8 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX6-NEXT: v_and_b32_e32 v4, v6, v8 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -608,19 +608,17 @@ ; GFX6-LABEL: v_roundeven_v2f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_and_b32_e32 v5, s6, v1 -; GFX6-NEXT: s_mov_b32 s7, 0x43300000 +; GFX6-NEXT: v_and_b32_e32 v5, 0x80000000, v1 ; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_or_b32_e32 v5, s7, v5 +; GFX6-NEXT: v_or_b32_e32 v5, 0x43300000, v5 ; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, 0x432fffff ; GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5] ; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; GFX6-NEXT: v_and_b32_e32 v5, s6, v3 -; GFX6-NEXT: v_or_b32_e32 v5, s7, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 0x80000000, v3 +; GFX6-NEXT: v_or_b32_e32 v5, 0x43300000, v5 ; GFX6-NEXT: v_add_f64 v[7:8], v[2:3], v[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; GFX6-NEXT: v_add_f64 v[4:5], v[7:8], -v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -259,10 +259,9 @@ ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -273,21 +272,19 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v5, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v4, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5 -; GFX8-NEXT: v_sub_u16_e32 v4, s4, v4 +; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_max_i16_e32 v1, v5, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 -; GFX8-NEXT: v_sub_u16_e32 v1, s4, v1 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1 ; GFX8-NEXT: v_max_i16_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 @@ -512,22 +509,21 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -540,33 +536,30 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v10, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v9, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v8, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v10, s5, v10 -; GFX8-NEXT: v_sub_u16_e32 v8, s4, v8 -; GFX8-NEXT: v_max_i16_e32 v1, v10, v1 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_max_i16_e32 v1, v9, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 ; GFX8-NEXT: v_min_i16_e32 v8, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 -; GFX8-NEXT: v_sub_u16_e32 v1, s4, v1 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8 +; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1 ; GFX8-NEXT: v_max_i16_e32 v2, v8, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 ; GFX8-NEXT: v_min_i16_e32 v6, 0, v2 -; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v6, s5, v6 -; GFX8-NEXT: v_sub_u16_e32 v4, v9, v4 +; GFX8-NEXT: v_sub_u16_e32 v6, 0x8000, v6 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_max_i16_e32 v3, v6, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v3 @@ -575,7 +568,7 @@ ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 ; GFX8-NEXT: v_max_i16_e32 v5, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v6, 0x8000, v6 -; GFX8-NEXT: v_sub_u16_e32 v5, v9, v5 +; GFX8-NEXT: v_sub_u16_e32 v5, 0x7fff, v5 ; GFX8-NEXT: v_max_i16_e32 v4, v6, v4 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v5 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v4 @@ -619,7 +612,7 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -853,7 +846,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -2691,19 +2684,17 @@ ; GFX8-LABEL: v_saddsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v3, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, s4, v3 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3 ; GFX8-NEXT: v_max_i16_e32 v4, v4, v1 ; GFX8-NEXT: v_min_i16_e32 v5, 0, v2 ; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5 -; GFX8-NEXT: v_sub_u16_e32 v4, s4, v4 +; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_max_i16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v3 @@ -2830,10 +2821,9 @@ ; GFX6-NEXT: v_min_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -2901,30 +2891,27 @@ ; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: saddsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s3, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: s_movk_i32 s2, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v3, s3, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, s2, v2 +; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 ; GFX8-NEXT: v_max_i16_e32 v3, s0, v3 ; GFX8-NEXT: v_min_i16_e32 v4, 0, v1 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 ; GFX8-NEXT: v_max_i16_e32 v3, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v4, s3, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, s2, v3 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3 ; GFX8-NEXT: v_max_i16_e32 v4, s1, v4 ; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 @@ -3001,17 +2988,16 @@ ; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3019,32 +3005,30 @@ ; GFX8-LABEL: v_saddsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x8000 ; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v6, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v7, s5, v7 -; GFX8-NEXT: v_sub_u16_e32 v6, s4, v6 +; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v2 ; GFX8-NEXT: v_min_i16_e32 v8, 0, v4 ; GFX8-NEXT: v_min_i16_e32 v6, v7, v6 ; GFX8-NEXT: v_max_i16_e32 v7, 0, v4 -; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 -; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8 +; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7 ; GFX8-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v8, 0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 ; GFX8-NEXT: v_max_i16_e32 v7, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 -; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8 +; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7 ; GFX8-NEXT: v_max_i16_e32 v8, v8, v3 ; GFX8-NEXT: v_min_i16_e32 v9, 0, v5 ; GFX8-NEXT: v_min_i16_e32 v7, v8, v7 ; GFX8-NEXT: v_max_i16_e32 v8, 0, v5 -; GFX8-NEXT: v_sub_u16_e32 v9, s5, v9 -; GFX8-NEXT: v_sub_u16_e32 v8, s4, v8 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 ; GFX8-NEXT: v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v8 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v6 @@ -3286,24 +3270,23 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 ; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3311,56 +3294,52 @@ ; GFX8-LABEL: v_saddsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v11, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v10, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v9, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v11, s5, v11 -; GFX8-NEXT: v_sub_u16_e32 v9, s4, v9 -; GFX8-NEXT: v_max_i16_e32 v11, v11, v3 -; GFX8-NEXT: v_min_i16_e32 v13, 0, v6 -; GFX8-NEXT: v_min_i16_e32 v9, v11, v9 -; GFX8-NEXT: v_max_i16_e32 v11, 0, v6 -; GFX8-NEXT: v_sub_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 -; GFX8-NEXT: v_max_i16_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v13, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v10, 0x8000, v10 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x7fff, v9 +; GFX8-NEXT: v_max_i16_e32 v10, v10, v3 +; GFX8-NEXT: v_min_i16_e32 v11, 0, v6 +; GFX8-NEXT: v_min_i16_e32 v9, v10, v9 +; GFX8-NEXT: v_max_i16_e32 v10, 0, v6 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v11, 0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v11 -; GFX8-NEXT: v_max_i16_e32 v11, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v7 -; GFX8-NEXT: v_min_i16_e32 v11, v13, v11 -; GFX8-NEXT: v_max_i16_e32 v13, 0, v7 -; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 -; GFX8-NEXT: v_sub_u16_e32 v13, s4, v13 -; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff -; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 -; GFX8-NEXT: v_max_i16_e32 v13, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v14, v12, v14 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v10 +; GFX8-NEXT: v_max_i16_e32 v10, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v4 +; GFX8-NEXT: v_min_i16_e32 v12, 0, v7 +; GFX8-NEXT: v_min_i16_e32 v10, v11, v10 +; GFX8-NEXT: v_max_i16_e32 v11, 0, v7 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_max_i16_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v12, 0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v13, v10, v13 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 -; GFX8-NEXT: v_min_i16_e32 v13, v14, v13 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v10, v10, v14 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v12, v12, v14 -; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v11 +; GFX8-NEXT: v_max_i16_e32 v11, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_max_i16_e32 v12, v12, v5 +; GFX8-NEXT: v_min_i16_e32 v13, 0, v8 +; GFX8-NEXT: v_min_i16_e32 v11, v12, v11 +; GFX8-NEXT: v_max_i16_e32 v12, 0, v8 +; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_max_i16_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v9 ; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v5, v5, v10 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v11 +; GFX8-NEXT: v_add_u16_e32 v1, v1, v10 ; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v13 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v11 ; GFX8-NEXT: v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3662,32 +3641,31 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 ; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3695,72 +3673,68 @@ ; GFX8-LABEL: v_saddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v13, 0, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v12, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_sub_u16_e32 v12, s4, v12 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v4 -; GFX8-NEXT: v_min_i16_e32 v16, 0, v8 -; GFX8-NEXT: v_min_i16_e32 v12, v14, v12 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 -; GFX8-NEXT: v_max_i16_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v16, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 +; GFX8-NEXT: v_min_i16_e32 v14, 0, v8 +; GFX8-NEXT: v_min_i16_e32 v12, v13, v12 +; GFX8-NEXT: v_max_i16_e32 v13, 0, v8 +; GFX8-NEXT: v_sub_u16_e32 v14, 0x8000, v14 +; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13 +; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v14, 0, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 -; GFX8-NEXT: v_max_i16_e32 v16, v16, v5 -; GFX8-NEXT: v_min_i16_e32 v17, 0, v9 -; GFX8-NEXT: v_min_i16_e32 v14, v16, v14 -; GFX8-NEXT: v_max_i16_e32 v16, 0, v9 -; GFX8-NEXT: v_sub_u16_e32 v17, s5, v17 -; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 -; GFX8-NEXT: v_sub_u16_e32 v16, s4, v16 -; GFX8-NEXT: v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v17, 0, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 +; GFX8-NEXT: v_max_i16_e32 v13, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v14, 0x8000, v14 +; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13 +; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 +; GFX8-NEXT: v_min_i16_e32 v15, 0, v9 +; GFX8-NEXT: v_min_i16_e32 v13, v14, v13 +; GFX8-NEXT: v_max_i16_e32 v14, 0, v9 +; GFX8-NEXT: v_sub_u16_e32 v15, 0x8000, v15 +; GFX8-NEXT: v_sub_u16_e32 v14, 0x7fff, v14 +; GFX8-NEXT: v_max_i16_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v15, 0, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff -; GFX8-NEXT: v_min_i16_e32 v5, v5, v16 -; GFX8-NEXT: v_max_i16_e32 v16, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v17, v15, v17 -; GFX8-NEXT: v_sub_u16_e32 v16, v13, v16 -; GFX8-NEXT: v_max_i16_e32 v17, v17, v6 -; GFX8-NEXT: v_min_i16_e32 v18, 0, v10 -; GFX8-NEXT: v_min_i16_e32 v16, v17, v16 -; GFX8-NEXT: v_max_i16_e32 v17, 0, v10 -; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 -; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 -; GFX8-NEXT: v_max_i16_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v18, 0, v3 -; GFX8-NEXT: v_min_i16_e32 v6, v6, v17 -; GFX8-NEXT: v_max_i16_e32 v17, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v14 +; GFX8-NEXT: v_max_i16_e32 v14, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v15, 0x8000, v15 +; GFX8-NEXT: v_sub_u16_e32 v14, 0x7fff, v14 +; GFX8-NEXT: v_max_i16_e32 v15, v15, v6 +; GFX8-NEXT: v_min_i16_e32 v16, 0, v10 +; GFX8-NEXT: v_min_i16_e32 v14, v15, v14 +; GFX8-NEXT: v_max_i16_e32 v15, 0, v10 +; GFX8-NEXT: v_sub_u16_e32 v16, 0x8000, v16 +; GFX8-NEXT: v_sub_u16_e32 v15, 0x7fff, v15 +; GFX8-NEXT: v_max_i16_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v16, 0, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 -; GFX8-NEXT: v_max_i16_e32 v18, v18, v7 -; GFX8-NEXT: v_min_i16_e32 v17, v18, v17 -; GFX8-NEXT: v_max_i16_e32 v18, 0, v11 -; GFX8-NEXT: v_sub_u16_e32 v13, v13, v18 -; GFX8-NEXT: v_min_i16_e32 v18, 0, v11 -; GFX8-NEXT: v_sub_u16_e32 v15, v15, v18 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v15 +; GFX8-NEXT: v_max_i16_e32 v15, 0, v3 +; GFX8-NEXT: v_sub_u16_e32 v16, 0x8000, v16 +; GFX8-NEXT: v_sub_u16_e32 v15, 0x7fff, v15 +; GFX8-NEXT: v_max_i16_e32 v16, v16, v7 +; GFX8-NEXT: v_min_i16_e32 v17, 0, v11 +; GFX8-NEXT: v_min_i16_e32 v15, v16, v15 +; GFX8-NEXT: v_max_i16_e32 v16, 0, v11 +; GFX8-NEXT: v_sub_u16_e32 v17, 0x8000, v17 ; GFX8-NEXT: v_add_u16_e32 v0, v0, v12 ; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_i16_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v16, 0x7fff, v16 +; GFX8-NEXT: v_max_i16_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v14 +; GFX8-NEXT: v_add_u16_e32 v1, v1, v13 ; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v7, v7, v13 +; GFX8-NEXT: v_min_i16_e32 v7, v7, v16 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v16 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v14 ; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v17 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v15 ; GFX8-NEXT: v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -615,20 +615,19 @@ ; GISEL-LABEL: v_sdiv_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 +; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_xor_b32_e32 v4, v4, v6 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 @@ -679,73 +678,72 @@ ; CGP-LABEL: v_sdiv_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 +; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_xor_b32_e32 v4, v4, v6 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v4, v6 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v7 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_xor_b32_e32 v4, v5, v7 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 +; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 ; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; CGP-NEXT: v_rcp_f32_e32 v6, v6 -; CGP-NEXT: v_rcp_f32_e32 v8, v8 -; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v9, v9, v8 -; CGP-NEXT: v_mul_lo_u32 v10, 0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_rcp_f32_e32 v5, v5 +; CGP-NEXT: v_rcp_f32_e32 v7, v7 +; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v9, v7 +; CGP-NEXT: v_mul_lo_u32 v10, 0, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v5, v6 ; CGP-NEXT: v_mul_lo_u32 v11, 0, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v7, v9 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9 -; CGP-NEXT: v_mul_lo_u32 v8, 0, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v9, 0, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v1, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_mul_lo_u32 v8, v6, v2 -; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v6 -; CGP-NEXT: v_mul_lo_u32 v10, v7, v3 -; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v7 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v7, 0, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v0, v5 +; CGP-NEXT: v_mul_lo_u32 v9, 0, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v1, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v5, v2 +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; CGP-NEXT: v_mul_lo_u32 v10, v6, v3 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v6 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5] ; CGP-NEXT: v_sub_i32_e64 v9, s[6:7], v1, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v6 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; CGP-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v8 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i32> , %y %r = sdiv <2 x i32> %x, %shl.y @@ -756,9 +754,8 @@ ; GISEL-LABEL: v_sdiv_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -792,9 +789,8 @@ ; CGP-LABEL: v_sdiv_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 @@ -829,11 +825,10 @@ ; GISEL-LABEL: v_sdiv_v2i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 -; GISEL-NEXT: v_and_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_and_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 @@ -895,11 +890,10 @@ ; CGP-LABEL: v_sdiv_v2i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 -; CGP-NEXT: v_and_b32_e32 v3, s4, v3 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -3325,11 +3325,10 @@ ; GISEL-LABEL: v_sdiv_i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v1, s4, v2 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -3353,10 +3352,9 @@ ; CGP-LABEL: v_sdiv_i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v1, s4, v2 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_i32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_cvt_f32_i32_e32 v0, v0 ; CGP-NEXT: v_rcp_f32_e32 v2, v1 ; CGP-NEXT: v_mul_f32_e32 v2, v0, v2 @@ -3379,8 +3377,7 @@ ; GISEL-LABEL: v_sdiv_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s6, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v1, s6, v4 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 @@ -3389,8 +3386,8 @@ ; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_and_b32_e32 v5, s6, v0 -; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 +; GISEL-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v4, v4 @@ -3405,7 +3402,7 @@ ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v12 ; GISEL-NEXT: v_mul_lo_u32 v12, v4, v11 ; GISEL-NEXT: v_mul_lo_u32 v13, v7, v10 -; GISEL-NEXT: v_and_b32_e32 v0, s6, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 ; GISEL-NEXT: v_mul_hi_u32 v2, v7, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 @@ -3651,15 +3648,14 @@ ; CGP-LABEL: v_sdiv_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v1, s4, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; CGP-NEXT: v_cvt_f32_i32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_cvt_f32_i32_e32 v0, v0 -; CGP-NEXT: v_and_b32_e32 v4, s4, v6 +; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 ; CGP-NEXT: v_rcp_f32_e32 v3, v1 ; CGP-NEXT: v_cvt_f32_i32_e32 v4, v4 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_i32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v3, v0, v3 ; CGP-NEXT: v_trunc_f32_e32 v3, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -906,9 +906,8 @@ ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f7ffffe +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s12, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s12 @@ -920,136 +919,135 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_xor_b32 s16, s0, s15 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s16 -; GFX8-NEXT: s_ashr_i32 s14, s4, 31 +; GFX8-NEXT: s_ashr_i32 s14, s8, 31 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s0, s4, s14 +; GFX8-NEXT: s_add_i32 s0, s8, s14 ; GFX8-NEXT: s_xor_b32 s0, s0, s14 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX8-NEXT: s_ashr_i32 s4, s5, 31 +; GFX8-NEXT: s_ashr_i32 s8, s9, 31 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, v2, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s13 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s13, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s13, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s13, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s13, v2 ; GFX8-NEXT: s_sub_i32 s0, 0, s16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v4, s0, v1 -; GFX8-NEXT: s_add_i32 s1, s5, s4 -; GFX8-NEXT: s_xor_b32 s1, s1, s4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX8-NEXT: s_add_i32 s1, s9, s8 +; GFX8-NEXT: s_xor_b32 s1, s1, s8 ; GFX8-NEXT: s_xor_b32 s0, s14, s12 -; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_xor_b32_e32 v2, s14, v2 -; GFX8-NEXT: s_ashr_i32 s5, s2, 31 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; GFX8-NEXT: s_ashr_i32 s9, s2, 31 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s14, v2 -; GFX8-NEXT: v_mul_lo_u32 v5, v1, s16 -; GFX8-NEXT: s_add_i32 s0, s2, s5 -; GFX8-NEXT: s_xor_b32 s2, s0, s5 -; GFX8-NEXT: s_ashr_i32 s12, s6, 31 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v5 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s16 +; GFX8-NEXT: s_add_i32 s0, s2, s9 +; GFX8-NEXT: s_xor_b32 s2, s0, s9 +; GFX8-NEXT: s_ashr_i32 s12, s10, 31 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 -; GFX8-NEXT: v_mul_f32_e32 v5, v5, v3 -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s16, v2 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s16, v2 ; GFX8-NEXT: s_sub_i32 s0, 0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, s0, v5 -; GFX8-NEXT: s_add_i32 s1, s6, s12 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_mul_lo_u32 v5, s0, v3 +; GFX8-NEXT: s_add_i32 s1, s10, s12 ; GFX8-NEXT: s_xor_b32 s1, s1, s12 -; GFX8-NEXT: s_xor_b32 s0, s4, s15 -; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: s_xor_b32 s0, s8, s15 +; GFX8-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX8-NEXT: v_xor_b32_e32 v2, s8, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6 -; GFX8-NEXT: v_mul_hi_u32 v6, s1, v5 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v2 -; GFX8-NEXT: s_ashr_i32 s4, s3, 31 -; GFX8-NEXT: v_mul_lo_u32 v7, v6, s2 -; GFX8-NEXT: s_add_i32 s0, s3, s4 -; GFX8-NEXT: s_xor_b32 s3, s0, s4 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v7 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v6 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v2 +; GFX8-NEXT: s_ashr_i32 s8, s3, 31 +; GFX8-NEXT: v_mul_lo_u32 v6, v3, s2 +; GFX8-NEXT: s_add_i32 s0, s3, s8 +; GFX8-NEXT: s_xor_b32 s3, s0, s8 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v3 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v7, s3 -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s2, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX8-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s3 +; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v3 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s2, v2 ; GFX8-NEXT: s_sub_i32 s0, 0, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, s0, v3 -; GFX8-NEXT: s_ashr_i32 s2, s7, 31 -; GFX8-NEXT: s_add_i32 s1, s7, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GFX8-NEXT: v_mul_hi_u32 v2, v3, v2 +; GFX8-NEXT: v_mul_lo_u32 v2, s0, v6 +; GFX8-NEXT: s_ashr_i32 s2, s11, 31 +; GFX8-NEXT: s_add_i32 s1, s11, s2 ; GFX8-NEXT: s_xor_b32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s0, s12, s5 -; GFX8-NEXT: v_xor_b32_e32 v6, s0, v6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_hi_u32 v3, s1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, s12, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v3, s3 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v6 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v3 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s1, v7 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v7 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v2 +; GFX8-NEXT: s_xor_b32 s0, s12, s9 +; GFX8-NEXT: v_xor_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; GFX8-NEXT: v_mul_hi_u32 v8, s1, v2 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, s12, v7 +; GFX8-NEXT: v_mul_lo_u32 v7, v8, s3 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v7 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v8 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s3, v7 -; GFX8-NEXT: s_xor_b32 s0, s2, s4 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX8-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc +; GFX8-NEXT: s_xor_b32 s0, s2, s8 +; GFX8-NEXT: v_xor_b32_e32 v3, s0, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, s2, v8 +; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, s9 -; GFX8-NEXT: v_xor_b32_e32 v7, s2, v7 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s2, v7 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f7ffffe ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s6, s12, 31 ; GFX9-NEXT: s_add_i32 s0, s12, s6 @@ -1057,119 +1055,118 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; GFX9-NEXT: s_ashr_i32 s4, s13, 31 -; GFX9-NEXT: s_add_i32 s5, s13, s4 +; GFX9-NEXT: s_ashr_i32 s5, s13, 31 +; GFX9-NEXT: s_add_i32 s12, s13, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s12, 0, s7 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_xor_b32 s12, s12, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12 +; GFX9-NEXT: s_sub_i32 s13, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s13, 0, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s12, s8, 31 -; GFX9-NEXT: s_add_i32 s8, s8, s12 -; GFX9-NEXT: s_xor_b32 s8, s8, s12 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_ashr_i32 s4, s8, 31 +; GFX9-NEXT: s_add_i32 s8, s8, s4 +; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s6, s12, s6 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s13, v1 +; GFX9-NEXT: s_xor_b32 s8, s8, s4 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: s_sub_i32 s16, 0, s12 ; GFX9-NEXT: s_ashr_i32 s13, s9, 31 ; GFX9-NEXT: s_add_i32 s9, s9, s13 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s7 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: s_xor_b32 s4, s13, s4 -; GFX9-NEXT: v_sub_u32_e32 v4, s8, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v4 -; GFX9-NEXT: s_xor_b32 s7, s9, s13 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s5 -; GFX9-NEXT: v_xor_b32_e32 v3, s12, v3 -; GFX9-NEXT: s_ashr_i32 s6, s14, 31 -; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3 -; GFX9-NEXT: v_sub_u32_e32 v3, s7, v5 -; GFX9-NEXT: s_add_i32 s7, s14, s6 -; GFX9-NEXT: s_xor_b32 s7, s7, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s7 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 -; GFX9-NEXT: s_sub_i32 s8, 0, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, s8, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: s_ashr_i32 s4, s15, 31 -; GFX9-NEXT: s_add_i32 s9, s15, s4 -; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX9-NEXT: s_xor_b32 s9, s9, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, s9 -; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v3 -; GFX9-NEXT: s_ashr_i32 s5, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s5 -; GFX9-NEXT: s_xor_b32 s8, s8, s5 -; GFX9-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, s8, v5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s13, v3 -; GFX9-NEXT: v_mul_lo_u32 v7, v6, s7 -; GFX9-NEXT: v_mul_f32_e32 v2, v8, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v3 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v7 -; GFX9-NEXT: s_sub_i32 s8, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v8, s8, v2 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v6 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s16, v1 +; GFX9-NEXT: s_xor_b32 s9, s9, s13 +; GFX9-NEXT: s_xor_b32 s6, s4, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX9-NEXT: s_xor_b32 s5, s13, s5 +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v3 -; GFX9-NEXT: s_ashr_i32 s7, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s12 +; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v2 +; GFX9-NEXT: s_ashr_i32 s4, s14, 31 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 +; GFX9-NEXT: s_add_i32 s6, s14, s4 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: v_sub_u32_e32 v2, s9, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s12, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 +; GFX9-NEXT: s_sub_i32 s7, 0, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s5, v1 +; GFX9-NEXT: s_ashr_i32 s5, s15, 31 +; GFX9-NEXT: s_add_i32 s9, s15, s5 +; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX9-NEXT: s_xor_b32 s9, s9, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s9 +; GFX9-NEXT: s_ashr_i32 s7, s10, 31 +; GFX9-NEXT: s_add_i32 s8, s10, s7 ; GFX9-NEXT: s_xor_b32 s8, s8, s7 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v8 -; GFX9-NEXT: v_mul_hi_u32 v8, s8, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: s_xor_b32 s6, s5, s6 -; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 -; GFX9-NEXT: v_mul_lo_u32 v7, v8, s9 -; GFX9-NEXT: v_xor_b32_e32 v2, s6, v6 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, s8, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s12, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s6 +; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 +; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s8, v6 +; GFX9-NEXT: s_sub_i32 s8, 0, s9 +; GFX9-NEXT: v_mul_lo_u32 v8, s8, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_subrev_u32_e32 v6, s6, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_subrev_u32_e32 v6, s6, v2 +; GFX9-NEXT: s_ashr_i32 s6, s11, 31 +; GFX9-NEXT: s_add_i32 s8, s11, s6 +; GFX9-NEXT: s_xor_b32 s8, s8, s6 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX9-NEXT: v_mul_hi_u32 v7, s8, v7 ; GFX9-NEXT: s_xor_b32 s4, s7, s4 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v7 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, v7, s9 +; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 +; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 +; GFX9-NEXT: s_xor_b32 s4, s6, s5 +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 @@ -1178,11 +1175,12 @@ ; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 +; GFX9-NEXT: v_xor_b32_e32 v6, s7, v6 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, s6, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s6, v7 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] ; GFX9-NEXT: s_endpgm @@ -2566,8 +2564,7 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2576,7 +2573,7 @@ ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_short v[0:1], v4 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -2651,9 +2648,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -2663,7 +2659,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v3 ; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2978,13 +2974,12 @@ ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3323,50 +3318,50 @@ ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s9, 0x7ffffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s1, s1, 0x1b0000 -; GFX8-NEXT: s_ashr_i32 s2, s1, 31 -; GFX8-NEXT: s_add_i32 s1, s1, s2 -; GFX8-NEXT: s_xor_b32 s3, s1, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX8-NEXT: s_sub_i32 s1, 0, s3 +; GFX8-NEXT: s_ashr_i32 s6, s1, 31 +; GFX8-NEXT: s_add_i32 s1, s1, s6 +; GFX8-NEXT: s_xor_b32 s7, s1, s6 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX8-NEXT: s_sub_i32 s1, 0, s7 ; GFX8-NEXT: s_bfe_i32 s0, s0, 0x1b0000 ; GFX8-NEXT: s_ashr_i32 s8, s0, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_add_i32 s0, s0, s8 -; GFX8-NEXT: s_xor_b32 s0, s0, s8 -; GFX8-NEXT: s_xor_b32 s2, s8, s2 +; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_xor_b32 s4, s8, s6 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e64 v2, s[0:1], s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e64 v2, s[0:1], s3, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_xor_b32_e32 v1, s8, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s8, v1 -; GFX8-NEXT: v_and_b32_e32 v3, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: flat_store_dword v[0:1], v3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_and_b32_e32 v2, s9, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mul_hi_u32 v2, s9, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s8, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -3390,8 +3385,7 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_xor_b32 s5, s8, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x7ffffff +; GFX9-NEXT: s_xor_b32 s4, s8, s6 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 @@ -3407,14 +3401,14 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s5, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -401,10 +401,9 @@ ; GFX7-LABEL: v_shl_v2i64_zext_v2i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_brev_b32 s4, -4 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0x3fffffff, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], 2 @@ -413,10 +412,9 @@ ; GFX8-LABEL: v_shl_v2i64_zext_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -4 -; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 0x3fffffff, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] @@ -425,10 +423,9 @@ ; GFX9-LABEL: v_shl_v2i64_zext_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_brev_b32 s4, -4 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0x3fffffff, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] @@ -485,9 +482,8 @@ ; GFX7-LABEL: v_shl_v2i64_sext_v2i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_brev_b32 s4, -8 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 @@ -497,9 +493,8 @@ ; GFX8-LABEL: v_shl_v2i64_sext_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_brev_b32 s4, -8 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -509,9 +504,8 @@ ; GFX9-LABEL: v_shl_v2i64_sext_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_brev_b32 s4, -8 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -659,13 +653,12 @@ ; GFX7-LABEL: v_shl_v2i32_zext_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -714,10 +714,9 @@ ; GFX6-LABEL: v_shl_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -826,13 +825,12 @@ define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) { ; GFX6-LABEL: shl_v2i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -866,10 +864,9 @@ ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -912,21 +909,20 @@ ; GFX6-LABEL: v_shl_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1057,38 +1053,36 @@ ; GFX6-LABEL: v_shl_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v9 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v8, v1 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v10 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v11 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v11 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v8, v3 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v12 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v13 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_mov_b32_e32 v16, 0xffff +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, v8, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v14 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, v8, v6 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v15 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, v2, v16 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v16 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, v8, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, v5, v16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v4, v16 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, v7, v16 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, v6, v16 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -568,19 +568,18 @@ ; GISEL-LABEL: v_srem_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 +; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v2 @@ -626,19 +625,18 @@ ; CGP-LABEL: v_srem_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 +; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v2 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_xor_b32_e32 v0, v0, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v5 ; CGP-NEXT: v_xor_b32_e32 v2, v2, v6 ; CGP-NEXT: v_xor_b32_e32 v3, v3, v7 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 @@ -697,9 +695,8 @@ ; GISEL-LABEL: v_srem_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -730,9 +727,8 @@ ; CGP-LABEL: v_srem_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 @@ -765,11 +761,10 @@ ; GISEL-LABEL: v_srem_v2i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 -; GISEL-NEXT: v_and_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_and_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 @@ -825,11 +820,10 @@ ; CGP-LABEL: v_srem_v2i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 -; CGP-NEXT: v_and_b32_e32 v3, s4, v3 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3273,11 +3273,10 @@ ; GISEL-LABEL: v_srem_i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v1, s4, v2 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -3299,10 +3298,9 @@ ; CGP-LABEL: v_srem_i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v1, s4, v2 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_i32_e32 v2, v1 -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_cvt_f32_i32_e32 v3, v0 ; CGP-NEXT: v_rcp_f32_e32 v4, v2 ; CGP-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -3327,8 +3325,7 @@ ; GISEL-LABEL: v_srem_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s6, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v1, s6, v4 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 @@ -3337,8 +3334,8 @@ ; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_and_b32_e32 v5, s6, v0 -; GISEL-NEXT: v_and_b32_e32 v6, s6, v6 +; GISEL-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 ; GISEL-NEXT: v_trunc_f32_e32 v4, v4 @@ -3353,7 +3350,7 @@ ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v12 ; GISEL-NEXT: v_mul_lo_u32 v12, v4, v11 ; GISEL-NEXT: v_mul_lo_u32 v13, v7, v10 -; GISEL-NEXT: v_and_b32_e32 v0, s6, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 ; GISEL-NEXT: v_mul_hi_u32 v2, v7, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v4, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 @@ -3597,14 +3594,13 @@ ; CGP-LABEL: v_srem_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v1, s4, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; CGP-NEXT: v_cvt_f32_i32_e32 v3, v1 -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_cvt_f32_i32_e32 v4, v0 -; CGP-NEXT: v_and_b32_e32 v6, s4, v6 +; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; CGP-NEXT: v_rcp_f32_e32 v5, v3 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_mul_f32_e32 v5, v4, v5 ; CGP-NEXT: v_trunc_f32_e32 v5, v5 ; CGP-NEXT: v_mad_f32 v4, -v5, v3, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -259,10 +259,9 @@ ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, 0xff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -273,21 +272,19 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5 +; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_max_i16_e32 v1, v4, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1 +; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 @@ -512,22 +509,21 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -540,40 +536,37 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v8, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 -; GFX8-NEXT: v_min_i16_e32 v10, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v10, s5, v10 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_min_i16_e32 v9, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_e32 v1, v8, v1 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v10 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v9 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v1, s4, v1 +; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 -; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff ; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_sub_u16_e32 v4, v4, v9 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v6, s5, v6 +; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_max_i16_e32 v5, -1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX8-NEXT: v_sub_u16_e32 v5, v5, v9 +; GFX8-NEXT: v_subrev_u16_e32 v5, 0x7fff, v5 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v3 ; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v4, v5, v4 @@ -619,7 +612,7 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -853,7 +846,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 @@ -2677,19 +2670,17 @@ ; GFX8-LABEL: v_ssubsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v3, -1, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v3, s4, v3 +; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v4, s5, v4 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 ; GFX8-NEXT: v_max_i16_e32 v3, v3, v1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v4, s4, v4 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v5, s5, v5 +; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v3 @@ -2816,10 +2807,9 @@ ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -2887,30 +2877,27 @@ ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: ssubsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s2, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: s_movk_i32 s3, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v2, s2, v2 +; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, s3, v3 +; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 ; GFX8-NEXT: v_max_i16_e32 v3, -1, v1 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_subrev_u16_e32 v3, s2, v3 +; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v4, s3, v4 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 ; GFX8-NEXT: v_max_i16_e32 v3, s1, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2 @@ -2987,17 +2974,16 @@ ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3005,32 +2991,30 @@ ; GFX8-LABEL: v_ssubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v6, s4, v6 +; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 ; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v7, s5, v7 +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v2 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 ; GFX8-NEXT: v_max_i16_e32 v7, -1, v4 -; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7 +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v4 -; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 ; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v7, -1, v1 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v8 -; GFX8-NEXT: v_subrev_u16_e32 v7, s4, v7 +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v8, s5, v8 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 ; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 ; GFX8-NEXT: v_min_i16_e32 v7, v7, v8 ; GFX8-NEXT: v_max_i16_e32 v8, -1, v5 -; GFX8-NEXT: v_subrev_u16_e32 v8, s4, v8 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v5 -; GFX8-NEXT: v_subrev_u16_e32 v9, s5, v9 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v6 @@ -3272,24 +3256,23 @@ ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3297,56 +3280,52 @@ ; GFX8-LABEL: v_ssubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v9, -1, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v9, s4, v9 -; GFX8-NEXT: v_min_i16_e32 v11, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9 +; GFX8-NEXT: v_min_i16_e32 v10, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v11, s5, v11 +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10 ; GFX8-NEXT: v_max_i16_e32 v9, v9, v3 -; GFX8-NEXT: v_min_i16_e32 v9, v9, v11 -; GFX8-NEXT: v_max_i16_e32 v11, -1, v6 -; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 -; GFX8-NEXT: v_min_i16_e32 v13, -1, v6 -; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v11, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v13 -; GFX8-NEXT: v_subrev_u16_e32 v11, s4, v11 -; GFX8-NEXT: v_min_i16_e32 v13, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v9, v9, v10 +; GFX8-NEXT: v_max_i16_e32 v10, -1, v6 +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_min_i16_e32 v11, -1, v6 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_max_i16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v10, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v11 +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_min_i16_e32 v11, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v13, s5, v13 -; GFX8-NEXT: v_max_i16_e32 v11, v11, v4 -; GFX8-NEXT: v_min_i16_e32 v11, v11, v13 -; GFX8-NEXT: v_max_i16_e32 v13, -1, v7 -; GFX8-NEXT: v_subrev_u16_e32 v13, s4, v13 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v7 -; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff -; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 -; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v13, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 -; GFX8-NEXT: v_sub_u16_e32 v13, v13, v10 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v2 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_max_i16_e32 v10, v10, v4 +; GFX8-NEXT: v_min_i16_e32 v10, v10, v11 +; GFX8-NEXT: v_max_i16_e32 v11, -1, v7 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_min_i16_e32 v12, -1, v7 +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v11, -1, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v12 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_min_i16_e32 v12, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v14, v14, v12 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v5 -; GFX8-NEXT: v_min_i16_e32 v13, v13, v14 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_sub_u16_e32 v10, v14, v10 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_sub_u16_e32 v12, v14, v12 -; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v5 +; GFX8-NEXT: v_min_i16_e32 v11, v11, v12 +; GFX8-NEXT: v_max_i16_e32 v12, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 ; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v13 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v11 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v10 ; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v13 +; GFX8-NEXT: v_sub_u16_e32 v2, v2, v11 ; GFX8-NEXT: v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3648,32 +3627,31 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -3681,72 +3659,68 @@ ; GFX8-LABEL: v_ssubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff ; GFX8-NEXT: v_max_i16_e32 v12, -1, v0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: v_subrev_u16_e32 v12, s4, v12 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v14, s5, v14 +; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 ; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 -; GFX8-NEXT: v_min_i16_e32 v12, v12, v14 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 -; GFX8-NEXT: v_min_i16_e32 v16, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v16 -; GFX8-NEXT: v_subrev_u16_e32 v14, s4, v14 -; GFX8-NEXT: v_min_i16_e32 v16, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v12, v12, v13 +; GFX8-NEXT: v_max_i16_e32 v13, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v8 +; GFX8-NEXT: v_subrev_u16_e32 v14, 0x8000, v14 +; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v13, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 +; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13 +; GFX8-NEXT: v_min_i16_e32 v14, -1, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v16, s5, v16 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 -; GFX8-NEXT: v_min_i16_e32 v14, v14, v16 -; GFX8-NEXT: v_max_i16_e32 v16, -1, v9 -; GFX8-NEXT: v_subrev_u16_e32 v16, s4, v16 -; GFX8-NEXT: v_min_i16_e32 v17, -1, v9 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff -; GFX8-NEXT: v_subrev_u16_e32 v17, s5, v17 -; GFX8-NEXT: v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v16, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v17 -; GFX8-NEXT: v_sub_u16_e32 v16, v16, v13 -; GFX8-NEXT: v_min_i16_e32 v17, -1, v2 +; GFX8-NEXT: v_subrev_u16_e32 v14, 0x8000, v14 +; GFX8-NEXT: v_max_i16_e32 v13, v13, v5 +; GFX8-NEXT: v_min_i16_e32 v13, v13, v14 +; GFX8-NEXT: v_max_i16_e32 v14, -1, v9 +; GFX8-NEXT: v_subrev_u16_e32 v14, 0x7fff, v14 +; GFX8-NEXT: v_min_i16_e32 v15, -1, v9 +; GFX8-NEXT: v_subrev_u16_e32 v15, 0x8000, v15 +; GFX8-NEXT: v_max_i16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v14, -1, v2 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v15 +; GFX8-NEXT: v_subrev_u16_e32 v14, 0x7fff, v14 +; GFX8-NEXT: v_min_i16_e32 v15, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v15 -; GFX8-NEXT: v_max_i16_e32 v16, v16, v6 -; GFX8-NEXT: v_min_i16_e32 v16, v16, v17 -; GFX8-NEXT: v_max_i16_e32 v17, -1, v10 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 -; GFX8-NEXT: v_min_i16_e32 v18, -1, v10 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 -; GFX8-NEXT: v_max_i16_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v17, -1, v3 -; GFX8-NEXT: v_min_i16_e32 v6, v6, v18 -; GFX8-NEXT: v_sub_u16_e32 v17, v17, v13 -; GFX8-NEXT: v_min_i16_e32 v18, -1, v3 +; GFX8-NEXT: v_subrev_u16_e32 v15, 0x8000, v15 +; GFX8-NEXT: v_max_i16_e32 v14, v14, v6 +; GFX8-NEXT: v_min_i16_e32 v14, v14, v15 +; GFX8-NEXT: v_max_i16_e32 v15, -1, v10 +; GFX8-NEXT: v_subrev_u16_e32 v15, 0x7fff, v15 +; GFX8-NEXT: v_min_i16_e32 v16, -1, v10 +; GFX8-NEXT: v_subrev_u16_e32 v16, 0x8000, v16 +; GFX8-NEXT: v_max_i16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v15, -1, v3 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v16 +; GFX8-NEXT: v_subrev_u16_e32 v15, 0x7fff, v15 +; GFX8-NEXT: v_min_i16_e32 v16, -1, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_sub_u16_e32 v18, v18, v15 -; GFX8-NEXT: v_max_i16_e32 v17, v17, v7 -; GFX8-NEXT: v_min_i16_e32 v17, v17, v18 -; GFX8-NEXT: v_max_i16_e32 v18, -1, v11 -; GFX8-NEXT: v_sub_u16_e32 v13, v18, v13 -; GFX8-NEXT: v_min_i16_e32 v18, -1, v11 +; GFX8-NEXT: v_subrev_u16_e32 v16, 0x8000, v16 +; GFX8-NEXT: v_max_i16_e32 v15, v15, v7 +; GFX8-NEXT: v_min_i16_e32 v15, v15, v16 +; GFX8-NEXT: v_max_i16_e32 v16, -1, v11 +; GFX8-NEXT: v_subrev_u16_e32 v16, 0x7fff, v16 +; GFX8-NEXT: v_min_i16_e32 v17, -1, v11 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 ; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_sub_u16_e32 v15, v18, v15 -; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_subrev_u16_e32 v17, 0x8000, v17 +; GFX8-NEXT: v_max_i16_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v14 +; GFX8-NEXT: v_sub_u16_e32 v1, v1, v13 ; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v7, v7, v15 +; GFX8-NEXT: v_min_i16_e32 v7, v7, v17 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v16 +; GFX8-NEXT: v_sub_u16_e32 v2, v2, v14 ; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v17 +; GFX8-NEXT: v_sub_u16_e32 v3, v3, v15 ; GFX8-NEXT: v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll @@ -138,12 +138,11 @@ ; GFX7-LABEL: v_trunc_v4i32_to_v4i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -454,7 +454,7 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -619,7 +619,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll @@ -308,9 +308,8 @@ ; GISEL-LABEL: v_udiv_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -354,9 +353,8 @@ ; CGP-LABEL: v_udiv_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 +; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -413,9 +411,8 @@ ; GISEL-LABEL: v_udiv_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -440,9 +437,8 @@ ; CGP-LABEL: v_udiv_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 @@ -477,11 +473,10 @@ ; GISEL-LABEL: v_udiv_v2i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 -; GISEL-NEXT: v_and_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_and_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -525,11 +520,10 @@ ; CGP-LABEL: v_udiv_v2i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 -; CGP-NEXT: v_and_b32_e32 v3, s4, v3 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -1841,9 +1841,8 @@ ; GISEL-LABEL: v_udiv_i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -1869,9 +1868,8 @@ ; CGP-LABEL: v_udiv_i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v2 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v1 @@ -1895,19 +1893,18 @@ ; GISEL-LABEL: v_udiv_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s6, 0xffffff -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_and_b32_e32 v1, s6, v4 -; GISEL-NEXT: v_and_b32_e32 v3, s6, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v1 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v1 ; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 @@ -1919,17 +1916,17 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 ; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 @@ -1939,56 +1936,56 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v18 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s6, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s8, -1, 0x10000 -; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 -; GISEL-NEXT: v_and_b32_e32 v2, s6, v2 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mov_b32_e32 v16, s4 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v18, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_mov_b32_e32 v15, s4 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v20, v18 ; GISEL-NEXT: v_mov_b32_e32 v19, s5 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_mov_b32_e32 v15, s7 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v16 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_mov_b32_e32 v16, s6 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v2 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 @@ -1997,7 +1994,7 @@ ; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 ; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v5, v2, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -2010,12 +2007,12 @@ ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 ; GISEL-NEXT: v_mul_lo_u32 v14, v8, v4 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v9 @@ -2032,7 +2029,7 @@ ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mov_b32_e32 v18, s8 +; GISEL-NEXT: v_mov_b32_e32 v18, s7 ; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 ; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 @@ -2043,122 +2040,121 @@ ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v4 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v17, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v0, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v13, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v14 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v1, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v1, v2 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v13, v1, v2 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v11, v3, v7 ; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v17, v3, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v10 ; GISEL-NEXT: v_mul_lo_u32 v9, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v5 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 ; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 ; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], 1, v7 -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v6, s[6:7] -; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v11 +; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v5, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v0, v11 ; GISEL-NEXT: v_subb_u32_e64 v11, s[8:9], 0, v10, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v13, v16, v13, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v13, v15, v13, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v15, v16, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v16, v15, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], 1, v12 ; GISEL-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v14, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v10 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[6:7] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v3 ; GISEL-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 -; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 1, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 1, v8 ; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], 0, v17, s[6:7] ; GISEL-NEXT: v_sub_i32_e64 v9, s[6:7], 0, v9 ; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v19, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v19, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v15, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v8, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v15, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v8, v0, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v16, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v6, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_udiv_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s6, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v1, s6, v2 -; CGP-NEXT: v_and_b32_e32 v2, s6, v4 -; CGP-NEXT: v_and_b32_e32 v3, s6, v6 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1 @@ -2179,8 +2175,8 @@ ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v2, s6, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v1 ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -727,7 +727,6 @@ ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x4f7ffffe ; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -739,75 +738,75 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_lo_u32 v3, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX8-NEXT: s_sub_i32 s0, 0, s9 -; GFX8-NEXT: v_mul_lo_u32 v4, s0, v1 -; GFX8-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX8-NEXT: v_mul_lo_u32 v3, v0, s8 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s8 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_mul_lo_u32 v5, v1, s9 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v5 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v5, v6 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX8-NEXT: v_mul_f32_e32 v5, v5, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s9, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s8, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s8, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v6 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s5, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s9, v2 ; GFX8-NEXT: s_sub_i32 s0, 0, s10 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s11 -; GFX8-NEXT: v_mul_lo_u32 v7, s0, v5 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX8-NEXT: v_mul_hi_u32 v7, v5, v7 -; GFX8-NEXT: v_mul_f32_e32 v2, v6, v2 -; GFX8-NEXT: v_add_u32_e64 v5, s[0:1], v5, v7 -; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_hi_u32 v7, s6, v5 -; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s9, v3 +; GFX8-NEXT: v_mul_lo_u32 v6, s0, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX8-NEXT: v_mul_hi_u32 v6, v3, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s11 +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v5 +; GFX8-NEXT: v_mul_hi_u32 v3, s6, v3 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s9, v2 +; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX8-NEXT: s_sub_i32 s0, 0, s11 -; GFX8-NEXT: v_mul_lo_u32 v6, s0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v7, s10 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 -; GFX8-NEXT: v_mul_hi_u32 v6, v2, v6 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s6, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v2, v5, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, v3, s10 +; GFX8-NEXT: v_mul_lo_u32 v7, s0, v6 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v3 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s7, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v7 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, v8, s11 -; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s10, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v7 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v8 +; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v2, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v7 +; GFX8-NEXT: v_mul_hi_u32 v7, s7, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, s11 +; GFX8-NEXT: v_subrev_u32_e64 v6, s[0:1], s10, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v7 @@ -827,8 +826,8 @@ ; GFX9-LABEL: udivrem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x4f7ffffe +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 @@ -836,89 +835,87 @@ ; GFX9-NEXT: s_sub_i32 s7, 0, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_lo_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v1 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s0 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_sub_u32_e32 v4, s8, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v6, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, s9, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX9-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s0 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s1 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3 +; GFX9-NEXT: v_sub_u32_e32 v3, s12, v3 +; GFX9-NEXT: v_sub_u32_e32 v7, s13, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s0, v3 ; GFX9-NEXT: s_sub_i32 s0, 0, s2 -; GFX9-NEXT: v_mul_lo_u32 v7, s0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, s0, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GFX9-NEXT: v_subrev_u32_e32 v6, s1, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v5 +; GFX9-NEXT: v_mul_hi_u32 v2, s14, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v6 ; GFX9-NEXT: s_sub_i32 s0, 0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX9-NEXT: v_subrev_u32_e32 v8, s1, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, v6, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v2, s2 +; GFX9-NEXT: v_mul_lo_u32 v8, s0, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s1, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, s14, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v3, v8 +; GFX9-NEXT: v_add_u32_e32 v8, 1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 -; GFX9-NEXT: v_mul_hi_u32 v3, s10, v3 -; GFX9-NEXT: v_add_u32_e32 v8, 1, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, s0, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v5 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v7 -; GFX9-NEXT: v_subrev_u32_e32 v8, s1, v5 -; GFX9-NEXT: v_sub_u32_e32 v6, s10, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v2, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, s11, v2 -; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s15, v3 +; GFX9-NEXT: v_subrev_u32_e32 v7, s2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 +; GFX9-NEXT: v_mul_lo_u32 v8, v3, s3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GFX9-NEXT: v_subrev_u32_e32 v7, s2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_sub_u32_e32 v7, s15, v8 +; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v6 -; GFX9-NEXT: v_mul_lo_u32 v8, v7, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc -; GFX9-NEXT: v_add_u32_e32 v2, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v3, vcc -; GFX9-NEXT: v_sub_u32_e32 v3, s11, v8 -; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX9-NEXT: v_subrev_u32_e32 v8, s3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX9-NEXT: v_subrev_u32_e32 v8, s3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v3, 1, v7 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s3, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] -; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v4i32: @@ -2042,16 +2039,15 @@ ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: flat_store_short v[0:1], v4 -; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s10 @@ -2086,7 +2082,6 @@ ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 @@ -2109,7 +2104,7 @@ ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -2117,7 +2112,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v1, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_short v1, v0, s[2:3] @@ -2308,24 +2303,23 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x14 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s2, s0, 0xffff -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX8-NEXT: s_sub_i32 s1, 0, s2 +; GFX8-NEXT: s_and_b32 s3, s0, 0xffff +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX8-NEXT: s_sub_i32 s1, 0, s3 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s5, s0, 16 +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX8-NEXT: s_sub_i32 s1, 0, s3 +; GFX8-NEXT: s_sub_i32 s1, 0, s2 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -2333,35 +2327,35 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX8-NEXT: v_mul_hi_u32 v1, s4, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s2, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3 -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v2 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 -; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s8 @@ -2636,7 +2630,6 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_mov_b32 s4, 0x7ffffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s7, v0 @@ -2654,11 +2647,11 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 -; GFX8-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7ffffff, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2677,7 +2670,6 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s4, 0x7ffffff ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 @@ -2693,10 +2685,10 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7ffffff, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -117,14 +117,13 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GISEL-NEXT: s_mov_b32 s4, 0x4f7ffffe ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v4, s4, v4 -; GISEL-NEXT: v_mul_f32_e32 v6, s4, v6 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4 @@ -217,9 +216,8 @@ ; CHECK-LABEL: v_urem_v2i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0xfff -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; CHECK-NEXT: v_and_b32_e32 v1, s4, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 0xfff, v0 +; CHECK-NEXT: v_and_b32_e32 v1, 0xfff, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i32> %num, ret <2 x i32> %result @@ -257,14 +255,14 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s4, 0x12d8fb ; GISEL-NEXT: v_mov_b32_e32 v2, 0x12d8fb -; GISEL-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v3, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb +; GISEL-NEXT: v_mov_b32_e32 v4, 0xffed2705 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v3 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v0, v3 ; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3 ; GISEL-NEXT: v_mul_lo_u32 v4, v4, s4 @@ -351,18 +349,16 @@ ; GISEL-LABEL: v_urem_v2i32_pow2_shl_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s4, 0x1000 -; GISEL-NEXT: s_mov_b32 s5, 0x4f7ffffe -; GISEL-NEXT: v_lshl_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_lshl_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 -; GISEL-NEXT: v_mul_f32_e32 v6, s5, v6 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4 @@ -394,9 +390,8 @@ ; CGP-LABEL: v_urem_v2i32_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_movk_i32 s4, 0x1000 -; CGP-NEXT: v_lshl_b32_e32 v2, s4, v2 -; CGP-NEXT: v_lshl_b32_e32 v3, s4, v3 +; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2 +; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 @@ -449,9 +444,8 @@ ; GISEL-LABEL: v_urem_i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -474,9 +468,8 @@ ; CGP-LABEL: v_urem_i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; CGP-NEXT: v_rcp_f32_e32 v2, v2 @@ -509,20 +502,18 @@ ; GISEL-LABEL: v_urem_v2i32_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: s_mov_b32 s5, 0x4f7ffffe -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v1 -; GISEL-NEXT: v_and_b32_e32 v2, s4, v2 -; GISEL-NEXT: v_and_b32_e32 v3, s4, v3 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_f32_e32 v4, s5, v4 -; GISEL-NEXT: v_mul_f32_e32 v6, s5, v6 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4 @@ -554,11 +545,10 @@ ; CGP-LABEL: v_urem_v2i32_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v1 -; CGP-NEXT: v_and_b32_e32 v2, s4, v2 -; CGP-NEXT: v_and_b32_e32 v3, s4, v3 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -954,9 +954,8 @@ ; CHECK-LABEL: v_urem_v2i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0xfff -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; CHECK-NEXT: v_and_b32_e32 v2, s4, v2 +; CHECK-NEXT: v_and_b32_e32 v0, 0xfff, v0 +; CHECK-NEXT: v_and_b32_e32 v2, 0xfff, v2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -1101,232 +1100,232 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb ; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 +; GISEL-NEXT: v_mov_b32_e32 v6, v4 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mov_b32_e32 v6, v5 -; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4 +; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v6 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v4 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mov_b32_e32 v5, s4 ; GISEL-NEXT: v_mov_b32_e32 v4, s5 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 ; GISEL-NEXT: s_sub_u32 s9, 0, 0x12d8fb ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 -; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: s_and_b32 s4, s4, 1 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_subb_u32 s10, 0, 0 +; GISEL-NEXT: v_mul_lo_u32 v10, s9, v8 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mul_lo_u32 v10, s9, v8 -; GISEL-NEXT: v_mov_b32_e32 v11, s4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, s6, v9 -; GISEL-NEXT: v_mul_lo_u32 v13, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, s10, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, s6, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, s7, v6 -; GISEL-NEXT: v_mul_hi_u32 v18, s6, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v11, s6, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v13, s10, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, s9, v6 +; GISEL-NEXT: v_mov_b32_e32 v15, s4 +; GISEL-NEXT: v_mul_lo_u32 v16, s6, v7 +; GISEL-NEXT: v_mul_lo_u32 v17, s7, v7 +; GISEL-NEXT: v_mul_hi_u32 v18, s6, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v19, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v16 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v16 ; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v7, v11 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v6, v10 ; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10 -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v10 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v12 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 +; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v18, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, s9, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, s10, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, s9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, s6, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, s7, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, s6, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, s10, v6 +; GISEL-NEXT: v_mul_hi_u32 v13, s9, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, s6, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7 +; GISEL-NEXT: v_mul_hi_u32 v16, s6, v7 ; GISEL-NEXT: v_mul_lo_u32 v17, s9, v8 ; GISEL-NEXT: v_mul_lo_u32 v18, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v10 +; GISEL-NEXT: v_mul_hi_u32 v19, v6, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_mul_lo_u32 v17, s6, v9 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v7, v14 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v18, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19 +; GISEL-NEXT: v_mul_hi_u32 v13, v6, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v15 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v14 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v16, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v15 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v1, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v2, v8 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 +; GISEL-NEXT: v_mul_hi_u32 v12, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, v2, v8 ; GISEL-NEXT: v_mul_lo_u32 v16, v3, v8 ; GISEL-NEXT: v_mul_hi_u32 v17, v2, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 ; GISEL-NEXT: v_mul_lo_u32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_mul_lo_u32 v12, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v13, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v16, v7 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v16, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7 -; GISEL-NEXT: v_mul_lo_u32 v15, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, s8, v6 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_mul_lo_u32 v8, s8, v8 ; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v6, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 -; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v1, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v13 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v1, v7, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v7 ; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, v15, v6, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_subrev_i32_e32 v6, vcc, s8, v2 +; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v6 +; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_subrev_i32_e32 v11, vcc, s8, v0 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -1334,19 +1333,19 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v10, v19, v10, vcc -; GISEL-NEXT: v_subrev_i32_e32 v13, vcc, s8, v6 +; GISEL-NEXT: v_subrev_i32_e32 v13, vcc, s8, v7 ; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; GISEL-NEXT: v_subrev_i32_e32 v12, vcc, s8, v11 ; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, v11, v12, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] @@ -2373,9 +2372,8 @@ ; GISEL-LABEL: v_urem_i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s4, 0xffffff -; GISEL-NEXT: v_and_b32_e32 v0, s4, v0 -; GISEL-NEXT: v_and_b32_e32 v1, s4, v2 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2 @@ -2399,9 +2397,8 @@ ; CGP-LABEL: v_urem_i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s4, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s4, v0 -; CGP-NEXT: v_and_b32_e32 v1, s4, v2 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 ; CGP-NEXT: v_rcp_f32_e32 v4, v3 @@ -2427,19 +2424,18 @@ ; GISEL-LABEL: v_urem_v2i64_24bit: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s6, 0xffffff -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_and_b32_e32 v3, s6, v4 -; GISEL-NEXT: v_and_b32_e32 v1, s6, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3 ; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 ; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 @@ -2451,17 +2447,17 @@ ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11 ; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, v4, v6 +; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12 +; GISEL-NEXT: v_mul_lo_u32 v15, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v16, v4, v12 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7 ; GISEL-NEXT: v_mul_lo_u32 v18, v10, v7 ; GISEL-NEXT: v_mul_hi_u32 v19, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13 ; GISEL-NEXT: v_mul_lo_u32 v15, v11, v17 ; GISEL-NEXT: v_mul_hi_u32 v18, v7, v17 @@ -2471,56 +2467,56 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; GISEL-NEXT: v_mul_lo_u32 v15, v8, v14 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v14 ; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14 ; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v12 -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14 +; GISEL-NEXT: v_add_i32_e64 v16, s[4:5], v6, v16 +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v18 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18 +; GISEL-NEXT: v_mul_lo_u32 v6, v8, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 -; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v18, v12, v16 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v6, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v17 +; GISEL-NEXT: v_mul_hi_u32 v17, v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v6, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6 +; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 +; GISEL-NEXT: s_bfe_i32 s6, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s7, -1, 0x10000 -; GISEL-NEXT: s_bfe_i32 s8, -1, 0x10000 -; GISEL-NEXT: v_and_b32_e32 v0, s6, v0 -; GISEL-NEXT: v_and_b32_e32 v2, s6, v2 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mov_b32_e32 v16, s4 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v18, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_mov_b32_e32 v15, s4 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v20, v18 ; GISEL-NEXT: v_mov_b32_e32 v19, s5 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_mov_b32_e32 v15, s7 +; GISEL-NEXT: v_mul_hi_u32 v16, v8, v16 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_mov_b32_e32 v16, s6 ; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v4, v6 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v4, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v4, v2 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2 +; GISEL-NEXT: v_mul_hi_u32 v14, v4, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 @@ -2529,7 +2525,7 @@ ; GISEL-NEXT: v_mul_lo_u32 v4, v4, v8 ; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v5, v2, v12 ; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 ; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -2542,12 +2538,12 @@ ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v4 +; GISEL-NEXT: v_mul_lo_u32 v10, v2, v4 ; GISEL-NEXT: v_mul_lo_u32 v14, v8, v4 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v6, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v18, v10 ; GISEL-NEXT: v_mul_lo_u32 v18, v11, v9 @@ -2564,7 +2560,7 @@ ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mov_b32_e32 v18, s8 +; GISEL-NEXT: v_mov_b32_e32 v18, s7 ; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 ; GISEL-NEXT: v_mul_hi_u32 v9, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 @@ -2575,118 +2571,117 @@ ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v6, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 -; GISEL-NEXT: v_mul_lo_u32 v12, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v12, v6, v4 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v14, v0, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v17, v2, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, v0, v9 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v10, 0, v9 -; GISEL-NEXT: v_mul_hi_u32 v11, v2, v9 +; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, 0, v9 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v13, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v14 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v14 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v3, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v2 +; GISEL-NEXT: v_mul_lo_u32 v12, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v2, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v10 ; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v4, vcc -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], 0, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v11 -; GISEL-NEXT: v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v5, s[6:7], 0, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v16, v7, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v15, v9, vcc -; GISEL-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v0, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v5, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v8, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc ; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v2, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v18, v0, vcc ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v5, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v15, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: s_mov_b32 s6, 0xffffff -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v1, s6, v2 -; CGP-NEXT: v_and_b32_e32 v2, s6, v4 -; CGP-NEXT: v_and_b32_e32 v3, s6, v6 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1 @@ -2711,8 +2706,8 @@ ; CGP-NEXT: v_mul_lo_u32 v3, v5, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_and_b32_e32 v0, s6, v0 -; CGP-NEXT: v_and_b32_e32 v2, s6, v1 +; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v1 ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -442,7 +442,7 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, 24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -603,7 +603,7 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: s_mov_b32 s5, 24 ; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 -; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -1491,7 +1491,6 @@ ; GFX6-LABEL: udiv_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s3, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s15, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, -1 @@ -1503,9 +1502,9 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s11 -; GFX6-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX6-NEXT: s_sub_i32 s2, 0, s9 @@ -1530,7 +1529,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 -; GFX6-NEXT: v_mul_f32_e32 v2, s3, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 @@ -1546,7 +1545,7 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX6-NEXT: v_mul_f32_e32 v4, s3, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v2, s10 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 @@ -1578,9 +1577,8 @@ ; GFX9-LABEL: udiv_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 @@ -1589,93 +1587,93 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 -; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: s_sub_i32 s2, 0, s10 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, s12, v5 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v5 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, s9 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, s2, v2 +; GFX9-NEXT: s_sub_i32 s2, 0, s11 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, s9 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX9-NEXT: v_add_u32_e32 v8, 1, v1 +; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 -; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 -; GFX9-NEXT: v_mul_f32_e32 v2, s12, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_sub_i32 s2, 0, s11 -; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v6 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, s2, v6 +; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, v2, s10 +; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 -; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 -; GFX9-NEXT: v_mul_lo_u32 v8, v3, s10 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 +; GFX9-NEXT: v_sub_u32_e32 v5, s6, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v6, s6, v8 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s10, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v5, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 +; GFX9-NEXT: v_subrev_u32_e32 v6, s10, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s11 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; GFX9-NEXT: v_sub_u32_e32 v3, s7, v6 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 +; GFX9-NEXT: v_sub_u32_e32 v5, s7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v3 +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_v4i32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX90A-NEXT: s_mov_b32 s3, 0x4f7ffffe -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX90A-NEXT: s_sub_i32 s2, 0, s8 +; GFX90A-NEXT: s_sub_i32 s3, 0, s9 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX90A-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -1688,10 +1686,9 @@ ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX90A-NEXT: s_sub_i32 s2, 0, s9 ; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 -; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX90A-NEXT: v_mul_lo_u32 v2, s3, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s10 @@ -1706,7 +1703,7 @@ ; GFX90A-NEXT: v_subrev_u32_e32 v5, s9, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX90A-NEXT: v_mul_f32_e32 v3, s3, v3 +; GFX90A-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -1718,7 +1715,7 @@ ; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v2 ; GFX90A-NEXT: v_mul_lo_u32 v3, v2, s10 -; GFX90A-NEXT: v_mul_f32_e32 v5, s3, v5 +; GFX90A-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 ; GFX90A-NEXT: v_sub_u32_e32 v3, s6, v3 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX90A-NEXT: v_add_u32_e32 v6, 1, v2 @@ -1879,26 +1876,25 @@ ; GFX6-LABEL: urem_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_i32 s2, 0, s8 -; GFX6-NEXT: s_sub_i32 s12, 0, s9 +; GFX6-NEXT: s_sub_i32 s12, 0, s8 +; GFX6-NEXT: s_sub_i32 s13, 0, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 -; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s13, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 @@ -1906,7 +1902,7 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -1926,7 +1922,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: s_sub_i32 s4, 0, s11 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, s13, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 @@ -1958,53 +1954,52 @@ ; GFX9-LABEL: urem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX9-NEXT: s_sub_i32 s2, 0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_sub_i32 s3, 0, s9 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX9-NEXT: s_sub_i32 s2, 0, s10 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_f32_e32 v2, s12, v5 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v1 +; GFX9-NEXT: s_sub_i32 s2, 0, s10 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s11 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 ; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 ; GFX9-NEXT: s_sub_i32 s2, 0, s11 -; GFX9-NEXT: v_mul_f32_e32 v3, s12, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 ; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 @@ -2036,9 +2031,8 @@ ; GFX90A-LABEL: urem_v4i32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX90A-NEXT: s_mov_b32 s12, 0x4f7ffffe -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX90A-NEXT: s_sub_i32 s2, 0, s8 @@ -2046,9 +2040,9 @@ ; GFX90A-NEXT: s_sub_i32 s3, 0, s9 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX90A-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s12, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -2070,7 +2064,7 @@ ; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX90A-NEXT: v_mul_f32_e32 v2, s12, v2 +; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX90A-NEXT: v_subrev_u32_e32 v3, s9, v1 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 @@ -2090,7 +2084,7 @@ ; GFX90A-NEXT: v_subrev_u32_e32 v5, s10, v2 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX90A-NEXT: v_mul_f32_e32 v3, s12, v3 +; GFX90A-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX90A-NEXT: v_subrev_u32_e32 v5, s10, v2 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 @@ -2287,7 +2281,6 @@ ; GFX6-LABEL: sdiv_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s16, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s15, 0xf000 ; GFX6-NEXT: s_mov_b32 s14, -1 @@ -2303,13 +2296,13 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: s_sub_i32 s1, 0, s3 ; GFX6-NEXT: s_ashr_i32 s0, s4, 31 -; GFX6-NEXT: v_mul_f32_e32 v0, s16, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: s_xor_b32 s2, s0, s2 ; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 ; GFX6-NEXT: s_add_i32 s1, s4, s0 -; GFX6-NEXT: v_mul_f32_e32 v1, s16, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -2343,7 +2336,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX6-NEXT: v_mul_f32_e32 v3, s16, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 @@ -2371,7 +2364,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v2, s4 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; GFX6-NEXT: v_mul_f32_e32 v4, s16, v4 +; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 @@ -2409,29 +2402,28 @@ ; ; GFX9-LABEL: sdiv_v4i32: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s15, 0x4f7ffffe -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s8, 31 -; GFX9-NEXT: s_add_i32 s3, s8, s2 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_ashr_i32 s0, s8, 31 +; GFX9-NEXT: s_add_i32 s1, s8, s0 +; GFX9-NEXT: s_xor_b32 s1, s1, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX9-NEXT: s_ashr_i32 s12, s9, 31 ; GFX9-NEXT: s_add_i32 s9, s9, s12 ; GFX9-NEXT: s_xor_b32 s9, s9, s12 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_sub_i32 s14, 0, s3 +; GFX9-NEXT: s_sub_i32 s14, 0, s1 ; GFX9-NEXT: s_ashr_i32 s8, s4, 31 -; GFX9-NEXT: v_mul_f32_e32 v0, s15, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_xor_b32 s4, s4, s8 ; GFX9-NEXT: v_mul_lo_u32 v2, s14, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s15, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_sub_i32 s14, 0, s9 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -2442,21 +2434,21 @@ ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 ; GFX9-NEXT: s_xor_b32 s5, s5, s13 -; GFX9-NEXT: s_xor_b32 s2, s8, s2 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 +; GFX9-NEXT: s_xor_b32 s0, s8, s0 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 -; GFX9-NEXT: s_ashr_i32 s3, s10, 31 -; GFX9-NEXT: s_add_i32 s4, s10, s3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2 +; GFX9-NEXT: s_ashr_i32 s1, s10, 31 +; GFX9-NEXT: s_add_i32 s4, s10, s1 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: s_xor_b32 s4, s4, s3 +; GFX9-NEXT: s_xor_b32 s4, s4, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s9 @@ -2466,7 +2458,7 @@ ; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc @@ -2485,17 +2477,17 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v5 ; GFX9-NEXT: s_xor_b32 s6, s6, s5 ; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 -; GFX9-NEXT: s_xor_b32 s2, s13, s12 +; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0 +; GFX9-NEXT: s_xor_b32 s0, s13, s12 ; GFX9-NEXT: v_mul_lo_u32 v5, v2, s4 -; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s2, v1 -; GFX9-NEXT: s_xor_b32 s2, s5, s3 -; GFX9-NEXT: s_sub_i32 s3, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v7, s3, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 +; GFX9-NEXT: s_xor_b32 s0, s5, s1 +; GFX9-NEXT: s_sub_i32 s1, 0, s9 +; GFX9-NEXT: v_mul_lo_u32 v7, s1, v3 ; GFX9-NEXT: v_sub_u32_e32 v5, s6, v5 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 @@ -2503,9 +2495,9 @@ ; GFX9-NEXT: v_subrev_u32_e32 v6, s4, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v7 -; GFX9-NEXT: s_ashr_i32 s3, s7, 31 -; GFX9-NEXT: s_add_i32 s5, s7, s3 -; GFX9-NEXT: s_xor_b32 s5, s5, s3 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_add_i32 s5, s7, s1 +; GFX9-NEXT: s_xor_b32 s5, s5, s1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 @@ -2513,8 +2505,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, v3, s9 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 -; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, s0, v2 ; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc @@ -2522,19 +2514,18 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 -; GFX9-NEXT: s_xor_b32 s2, s3, s8 +; GFX9-NEXT: s_xor_b32 s0, s1, s8 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 +; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v3 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: sdiv_v4i32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX90A-NEXT: s_mov_b32 s13, 0x4f7ffffe -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s2, s8, 31 ; GFX90A-NEXT: s_add_i32 s3, s8, s2 @@ -2547,16 +2538,16 @@ ; GFX90A-NEXT: s_xor_b32 s4, s4, s8 ; GFX90A-NEXT: s_sub_i32 s8, 0, s3 ; GFX90A-NEXT: s_ashr_i32 s12, s9, 31 -; GFX90A-NEXT: v_mul_f32_e32 v0, s13, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: s_add_i32 s9, s9, s12 ; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 ; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX90A-NEXT: v_mul_lo_u32 v1, v0, s3 ; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX90A-NEXT: s_add_i32 s4, s9, s12 -; GFX90A-NEXT: s_xor_b32 s4, s4, s12 +; GFX90A-NEXT: s_xor_b32 s4, s9, s12 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 @@ -2568,7 +2559,7 @@ ; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s13, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 ; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 @@ -2596,7 +2587,7 @@ ; GFX90A-NEXT: v_add_u32_e32 v3, 1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX90A-NEXT: v_xor_b32_e32 v1, s3, v1 -; GFX90A-NEXT: v_mul_f32_e32 v2, s13, v2 +; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX90A-NEXT: v_subrev_u32_e32 v1, s3, v1 ; GFX90A-NEXT: s_ashr_i32 s3, s6, 31 @@ -2624,7 +2615,7 @@ ; GFX90A-NEXT: v_add_u32_e32 v5, 1, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX90A-NEXT: v_xor_b32_e32 v2, s2, v2 -; GFX90A-NEXT: v_mul_f32_e32 v3, s13, v3 +; GFX90A-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v2 ; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 @@ -2815,7 +2806,6 @@ ; GFX6-LABEL: srem_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s14, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -2823,29 +2813,30 @@ ; GFX6-NEXT: s_add_i32 s8, s8, s2 ; GFX6-NEXT: s_xor_b32 s8, s8, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_ashr_i32 s12, s9, 31 -; GFX6-NEXT: s_add_i32 s9, s9, s12 -; GFX6-NEXT: s_xor_b32 s9, s9, s12 +; GFX6-NEXT: s_ashr_i32 s13, s9, 31 +; GFX6-NEXT: s_add_i32 s9, s9, s13 +; GFX6-NEXT: s_xor_b32 s9, s9, s13 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s13, 0, s8 +; GFX6-NEXT: s_sub_i32 s14, 0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: s_ashr_i32 s12, s4, 31 -; GFX6-NEXT: v_mul_f32_e32 v0, s14, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: s_add_i32 s4, s4, s12 ; GFX6-NEXT: s_xor_b32 s4, s4, s12 -; GFX6-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s14, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s14, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_sub_i32 s13, 0, s9 +; GFX6-NEXT: s_sub_i32 s14, 0, s9 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s13, v1 ; GFX6-NEXT: s_ashr_i32 s13, s5, 31 ; GFX6-NEXT: s_add_i32 s5, s5, s13 +; GFX6-NEXT: s_xor_b32 s5, s5, s13 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s14, v1 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -2853,26 +2844,25 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 -; GFX6-NEXT: s_xor_b32 s4, s5, s13 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: s_ashr_i32 s5, s10, 31 +; GFX6-NEXT: s_ashr_i32 s4, s10, 31 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX6-NEXT: s_add_i32 s8, s10, s5 -; GFX6-NEXT: s_xor_b32 s5, s8, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 +; GFX6-NEXT: s_add_i32 s8, s10, s4 +; GFX6-NEXT: s_xor_b32 s4, s8, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, s14, v2 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 -; GFX6-NEXT: s_sub_i32 s4, 0, s5 +; GFX6-NEXT: s_sub_i32 s5, 0, s4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 @@ -2880,22 +2870,22 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 ; GFX6-NEXT: s_ashr_i32 s8, s11, 31 ; GFX6-NEXT: s_add_i32 s9, s11, s8 -; GFX6-NEXT: s_ashr_i32 s4, s6, 31 +; GFX6-NEXT: s_ashr_i32 s5, s6, 31 ; GFX6-NEXT: s_xor_b32 s8, s9, s8 -; GFX6-NEXT: s_add_i32 s6, s6, s4 +; GFX6-NEXT: s_add_i32 s6, s6, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX6-NEXT: s_xor_b32 s6, s6, s4 +; GFX6-NEXT: s_xor_b32 s6, s6, s5 ; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, s13, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s13, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s5 -; GFX6-NEXT: v_mul_f32_e32 v3, s14, v3 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v2 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GFX6-NEXT: s_sub_i32 s6, 0, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 @@ -2903,14 +2893,14 @@ ; GFX6-NEXT: s_add_i32 s7, s7, s6 ; GFX6-NEXT: s_xor_b32 s7, s7, s6 ; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v2 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, s5, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, s8 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 @@ -2926,37 +2916,37 @@ ; GFX9-LABEL: srem_v4i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s13, 0x4f7ffffe -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s8, 31 ; GFX9-NEXT: s_add_i32 s8, s8, s2 ; GFX9-NEXT: s_xor_b32 s2, s8, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_ashr_i32 s3, s9, 31 -; GFX9-NEXT: s_sub_i32 s12, 0, s2 ; GFX9-NEXT: s_add_i32 s8, s9, s3 +; GFX9-NEXT: s_sub_i32 s12, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_xor_b32 s3, s8, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_ashr_i32 s8, s4, 31 -; GFX9-NEXT: v_mul_f32_e32 v0, s13, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_xor_b32 s4, s4, s8 ; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s13, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_sub_i32 s12, 0, s3 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_ashr_i32 s9, s5, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_ashr_i32 s12, s10, 31 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: s_add_i32 s5, s5, s9 ; GFX9-NEXT: s_xor_b32 s5, s5, s9 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 @@ -2967,9 +2957,8 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX9-NEXT: s_ashr_i32 s2, s10, 31 -; GFX9-NEXT: s_add_i32 s4, s10, s2 -; GFX9-NEXT: s_xor_b32 s2, s4, s2 +; GFX9-NEXT: s_add_i32 s2, s10, s12 +; GFX9-NEXT: s_xor_b32 s2, s2, s12 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 @@ -2979,7 +2968,7 @@ ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, s13, v2 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 @@ -2998,7 +2987,7 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: s_xor_b32 s5, s5, s4 ; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, s13, v5 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 @@ -3035,112 +3024,111 @@ ; ; GFX90A-LABEL: srem_v4i32: ; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX90A-NEXT: s_mov_b32 s12, 0x4f7ffffe -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_ashr_i32 s2, s8, 31 -; GFX90A-NEXT: s_add_i32 s3, s8, s2 -; GFX90A-NEXT: s_xor_b32 s2, s3, s2 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX90A-NEXT: s_ashr_i32 s0, s8, 31 +; GFX90A-NEXT: s_add_i32 s1, s8, s0 +; GFX90A-NEXT: s_xor_b32 s0, s1, s0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX90A-NEXT: s_ashr_i32 s8, s9, 31 ; GFX90A-NEXT: s_add_i32 s9, s9, s8 ; GFX90A-NEXT: s_xor_b32 s8, s9, s8 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX90A-NEXT: s_sub_i32 s9, 0, s2 -; GFX90A-NEXT: s_ashr_i32 s3, s4, 31 -; GFX90A-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX90A-NEXT: s_sub_i32 s9, 0, s0 +; GFX90A-NEXT: s_ashr_i32 s1, s4, 31 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: s_add_i32 s4, s4, s3 +; GFX90A-NEXT: s_add_i32 s4, s4, s1 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX90A-NEXT: s_xor_b32 s4, s4, s3 +; GFX90A-NEXT: s_xor_b32 s4, s4, s1 ; GFX90A-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s2 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 ; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s12, v1 -; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s0, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s0, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX90A-NEXT: s_sub_i32 s4, 0, s8 -; GFX90A-NEXT: v_xor_b32_e32 v0, s3, v0 -; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 +; GFX90A-NEXT: v_xor_b32_e32 v0, s1, v0 +; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 ; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v1 -; GFX90A-NEXT: v_subrev_u32_e32 v0, s3, v0 -; GFX90A-NEXT: s_add_i32 s3, s5, s2 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s1, v0 +; GFX90A-NEXT: s_add_i32 s1, s5, s0 ; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX90A-NEXT: s_xor_b32 s3, s3, s2 +; GFX90A-NEXT: s_xor_b32 s1, s1, s0 ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-NEXT: v_mul_hi_u32 v1, s3, v1 +; GFX90A-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s8 -; GFX90A-NEXT: v_sub_u32_e32 v1, s3, v1 -; GFX90A-NEXT: s_ashr_i32 s3, s10, 31 -; GFX90A-NEXT: s_add_i32 s4, s10, s3 +; GFX90A-NEXT: v_sub_u32_e32 v1, s1, v1 +; GFX90A-NEXT: s_ashr_i32 s1, s10, 31 +; GFX90A-NEXT: s_add_i32 s4, s10, s1 ; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v1 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 -; GFX90A-NEXT: s_xor_b32 s3, s4, s3 +; GFX90A-NEXT: s_xor_b32 s1, s4, s1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v1 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX90A-NEXT: v_xor_b32_e32 v1, s2, v1 -; GFX90A-NEXT: s_sub_i32 s5, 0, s3 -; GFX90A-NEXT: v_subrev_u32_e32 v1, s2, v1 -; GFX90A-NEXT: v_mul_f32_e32 v2, s12, v2 +; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX90A-NEXT: s_sub_i32 s5, 0, s1 +; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v1 +; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX90A-NEXT: s_ashr_i32 s2, s6, 31 -; GFX90A-NEXT: s_add_i32 s4, s6, s2 -; GFX90A-NEXT: s_xor_b32 s4, s4, s2 +; GFX90A-NEXT: s_ashr_i32 s0, s6, 31 +; GFX90A-NEXT: s_add_i32 s4, s6, s0 +; GFX90A-NEXT: s_xor_b32 s4, s4, s0 ; GFX90A-NEXT: v_mul_lo_u32 v3, s5, v2 ; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v2 -; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s1 ; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 ; GFX90A-NEXT: s_ashr_i32 s4, s11, 31 ; GFX90A-NEXT: s_add_i32 s5, s11, s4 -; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v2 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v2 ; GFX90A-NEXT: s_xor_b32 s4, s5, s4 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX90A-NEXT: v_subrev_u32_e32 v5, s3, v2 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX90A-NEXT: v_subrev_u32_e32 v5, s1, v2 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v2 ; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX90A-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX90A-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX90A-NEXT: s_sub_i32 s5, 0, s4 -; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v2 -; GFX90A-NEXT: v_mul_f32_e32 v3, s12, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v2, s0, v2 +; GFX90A-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 -; GFX90A-NEXT: s_add_i32 s3, s7, s2 -; GFX90A-NEXT: s_xor_b32 s3, s3, s2 +; GFX90A-NEXT: s_ashr_i32 s0, s7, 31 +; GFX90A-NEXT: s_add_i32 s1, s7, s0 +; GFX90A-NEXT: s_xor_b32 s1, s1, s0 ; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3 ; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 ; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX90A-NEXT: v_mul_hi_u32 v3, s3, v3 +; GFX90A-NEXT: v_mul_hi_u32 v3, s1, v3 ; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 -; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3 +; GFX90A-NEXT: v_sub_u32_e32 v3, s1, v3 ; GFX90A-NEXT: v_subrev_u32_e32 v5, s4, v3 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX90A-NEXT: v_subrev_u32_e32 v5, s4, v3 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX90A-NEXT: v_xor_b32_e32 v3, s2, v3 -; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v3 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX90A-NEXT: v_xor_b32_e32 v3, s0, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX90A-NEXT: s_endpgm %r = srem <4 x i32> %x, %y store <4 x i32> %r, <4 x i32> addrspace(1)* %out @@ -3236,20 +3224,19 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s9, s2, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_lshr_b32 s9, s0, 16 +; GFX6-NEXT: s_lshr_b32 s8, s0, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 -; GFX6-NEXT: s_and_b32 s2, s3, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 +; GFX6-NEXT: s_and_b32 s0, s3, 0xffff ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 @@ -3260,18 +3247,18 @@ ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 -; GFX6-NEXT: s_lshr_b32 s10, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX6-NEXT: s_lshr_b32 s2, s3, 16 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX6-NEXT: s_lshr_b32 s9, s1, 16 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v3 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 @@ -3285,9 +3272,9 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -3342,10 +3329,9 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -3400,10 +3386,9 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: v_mad_f32 v5, -v5, v4, v7 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -3510,19 +3495,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s9, s2, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX6-NEXT: s_and_b32 s10, s0, 0xffff -; GFX6-NEXT: s_lshr_b32 s11, s2, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 +; GFX6-NEXT: s_lshr_b32 s10, s2, 16 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s11 -; GFX6-NEXT: s_lshr_b32 s9, s0, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 +; GFX6-NEXT: s_lshr_b32 s8, s0, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 @@ -3540,15 +3524,15 @@ ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX6-NEXT: s_and_b32 s2, s1, 0xffff -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_lshr_b32 s12, s3, 16 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 +; GFX6-NEXT: s_lshr_b32 s11, s3, 16 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s8, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s12 -; GFX6-NEXT: s_lshr_b32 s10, s1, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s11 +; GFX6-NEXT: s_lshr_b32 s9, s1, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s9 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 @@ -3563,12 +3547,12 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s12 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 @@ -3625,17 +3609,16 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s9 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_sub_u32_e32 v5, s0, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 ; GFX9-NEXT: v_sub_u32_e32 v3, s1, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -3698,10 +3681,9 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc ; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s9 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX90A-NEXT: v_sub_u32_e32 v4, s1, v4 -; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 ; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -3883,12 +3865,11 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: s_mov_b32 s0, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -3962,11 +3943,10 @@ ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 -; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v5, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -4040,11 +4020,10 @@ ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX90A-NEXT: v_add_u32_e32 v0, s0, v6 -; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX90A-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, v5, v3 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_endpgm @@ -4206,6 +4185,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s2 ; GFX6-NEXT: s_sext_i32_i16 s2, s3 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 ; GFX6-NEXT: s_sext_i32_i16 s0, s1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 @@ -4240,13 +4220,11 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 -; GFX6-NEXT: s_mov_b32 s0, 0xffff +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -4327,12 +4305,11 @@ ; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 ; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 -; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s7, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, v4, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm @@ -4414,10 +4391,9 @@ ; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 ; GFX90A-NEXT: v_add_u32_e32 v3, s0, v6 ; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 -; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v3 -; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX90A-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] @@ -5134,45 +5110,44 @@ ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_and_b32 s6, s0, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: s_and_b32 s6, s2, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: v_alignbit_b32 v4, s1, v4, 16 -; GFX6-NEXT: v_and_b32_e32 v5, s8, v4 +; GFX6-NEXT: s_and_b32 s8, s0, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX6-NEXT: v_mad_f32 v3, -v4, v0, v3 ; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 -; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 ; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 +; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mad_f32 v3, -v4, v5, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: s_and_b32 s0, s3, 0xffff -; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX6-NEXT: v_mad_f32 v3, -v5, v2, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 @@ -5182,7 +5157,7 @@ ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -5929,30 +5904,29 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff -; GFX6-NEXT: s_and_b32 s9, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: s_and_b32 s3, s2, 0x7fff ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 @@ -5966,9 +5940,9 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 ; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 -; GFX6-NEXT: v_and_b32_e32 v2, s3, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GFX6-NEXT: v_and_b32_e32 v3, s3, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 @@ -5984,7 +5958,6 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff @@ -6000,7 +5973,7 @@ ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 @@ -6009,7 +5982,7 @@ ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 -; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 @@ -6023,9 +5996,9 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 ; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 -; GFX9-NEXT: v_and_b32_e32 v3, s8, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v4, s8, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 @@ -6040,7 +6013,6 @@ ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_and_b32 s0, s4, 0x7fff @@ -6056,7 +6028,7 @@ ; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX90A-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 @@ -6065,7 +6037,7 @@ ; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 -; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mad_f32 v5, -v1, v6, v7 @@ -6079,8 +6051,8 @@ ; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v1 ; GFX90A-NEXT: v_mad_f32 v0, -v1, v3, v0 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 -; GFX90A-NEXT: v_and_b32_e32 v3, s8, v4 -; GFX90A-NEXT: v_and_b32_e32 v4, s8, v5 +; GFX90A-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, 0x7fff, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] @@ -6174,33 +6146,32 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s2, 0x7fff -; GFX6-NEXT: s_and_b32 s10, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 +; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff +; GFX6-NEXT: s_and_b32 s9, s0, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f +; GFX6-NEXT: s_bfe_u32 s9, s2, 0xf000f ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 @@ -6217,12 +6188,12 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: s_lshr_b32 s8, s2, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 +; GFX6-NEXT: s_lshr_b32 s3, s2, 15 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v1 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v2, s3, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -6237,7 +6208,6 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -6256,13 +6226,13 @@ ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 @@ -6285,9 +6255,9 @@ ; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 ; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, s8, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 @@ -6301,12 +6271,11 @@ ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_and_b32 s1, s4, 0x7fff -; GFX90A-NEXT: s_and_b32 s9, s6, 0x7fff -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX90A-NEXT: s_and_b32 s8, s6, 0x7fff +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 ; GFX90A-NEXT: v_alignbit_b32 v3, s7, v3, 30 @@ -6324,7 +6293,7 @@ ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX90A-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX90A-NEXT: v_sub_u32_e32 v4, s4, v1 ; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 @@ -6332,7 +6301,7 @@ ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mad_f32 v7, -v1, v6, v7 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 @@ -6348,8 +6317,8 @@ ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX90A-NEXT: v_and_b32_e32 v3, s8, v4 -; GFX90A-NEXT: v_and_b32_e32 v4, s8, v6 +; GFX90A-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, 0x7fff, v6 ; GFX90A-NEXT: v_sub_u32_e32 v0, v0, v1 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] @@ -6499,11 +6468,10 @@ ; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: s_movk_i32 s0, 0x7fff +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -6570,10 +6538,9 @@ ; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 -; GFX9-NEXT: v_and_b32_e32 v3, s0, v4 -; GFX9-NEXT: v_and_b32_e32 v4, s0, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 @@ -6639,10 +6606,9 @@ ; GFX90A-NEXT: v_cvt_i32_f32_e32 v7, v1 ; GFX90A-NEXT: v_mad_f32 v1, -v1, v3, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| -; GFX90A-NEXT: s_movk_i32 s0, 0x7fff ; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX90A-NEXT: v_and_b32_e32 v3, s0, v4 -; GFX90A-NEXT: v_and_b32_e32 v4, s0, v5 +; GFX90A-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, 0x7fff, v5 ; GFX90A-NEXT: v_add_u32_e32 v0, v7, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] @@ -6748,17 +6714,17 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s2, 0x7fff -; GFX6-NEXT: s_and_b32 s11, s0, 0x7fff -; GFX6-NEXT: s_bfe_i32 s11, s11, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s11 -; GFX6-NEXT: s_bfe_i32 s9, s9, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s9 -; GFX6-NEXT: s_xor_b32 s9, s9, s11 +; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff +; GFX6-NEXT: s_and_b32 s10, s0, 0x7fff +; GFX6-NEXT: s_bfe_i32 s10, s10, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s10 +; GFX6-NEXT: s_bfe_i32 s8, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s8 +; GFX6-NEXT: s_xor_b32 s8, s8, s10 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_ashr_i32 s9, s9, 30 -; GFX6-NEXT: s_or_b32 s9, s9, 1 -; GFX6-NEXT: v_mov_b32_e32 v5, s9 +; GFX6-NEXT: s_ashr_i32 s8, s8, 30 +; GFX6-NEXT: s_or_b32 s8, s8, 1 +; GFX6-NEXT: v_mov_b32_e32 v5, s8 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 @@ -6766,37 +6732,36 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_bfe_u32 s12, s0, 0xf000f +; GFX6-NEXT: s_bfe_u32 s11, s0, 0xf000f ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 ; GFX6-NEXT: s_lshr_b32 s1, s0, 15 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 -; GFX6-NEXT: s_bfe_i32 s0, s12, 0xf0000 +; GFX6-NEXT: s_bfe_i32 s0, s11, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f ; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_lshr_b32 s8, s2, 15 +; GFX6-NEXT: s_bfe_u32 s9, s2, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: s_lshr_b32 s3, s2, 15 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 -; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 +; GFX6-NEXT: s_bfe_i32 s2, s9, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: s_xor_b32 s0, s2, s0 ; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: s_or_b32 s0, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: s_or_b32 s0, s0, 1 -; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX6-NEXT: v_mov_b32_e32 v6, s0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GFX6-NEXT: v_bfe_i32 v4, v1, 0, 15 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v4 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_bfe_i32 v6, v0, 0, 15 ; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v5 @@ -6812,11 +6777,11 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 -; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 @@ -6831,7 +6796,6 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff ; GFX9-NEXT: s_and_b32 s1, s6, 0x7fff @@ -6848,18 +6812,18 @@ ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX9-NEXT: s_lshr_b32 s9, s4, 15 +; GFX9-NEXT: s_lshr_b32 s8, s4, 15 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 ; GFX9-NEXT: s_bfe_u32 s5, s4, 0xf000f ; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 ; GFX9-NEXT: s_lshr_b32 s7, s6, 15 -; GFX9-NEXT: s_bfe_u32 s10, s6, 0xf000f -; GFX9-NEXT: s_or_b32 s11, s0, 1 +; GFX9-NEXT: s_bfe_u32 s9, s6, 0xf000f +; GFX9-NEXT: s_or_b32 s10, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s11, 0 +; GFX9-NEXT: s_cselect_b32 s0, s10, 0 ; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 -; GFX9-NEXT: s_bfe_i32 s0, s10, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s9, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX9-NEXT: s_bfe_i32 s1, s5, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 @@ -6867,7 +6831,7 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s5, s0, 1 -; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 @@ -6878,7 +6842,7 @@ ; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 15 ; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v4 -; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9-NEXT: v_bfe_i32 v6, v0, 0, 15 ; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 @@ -6895,12 +6859,12 @@ ; GFX9-NEXT: v_add_u32_e32 v4, v8, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, s6 ; GFX9-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 -; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 @@ -6915,7 +6879,6 @@ ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_and_b32 s0, s4, 0x7fff @@ -6933,20 +6896,20 @@ ; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 ; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX90A-NEXT: s_lshr_b32 s8, s4, 15 ; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX90A-NEXT: s_lshr_b32 s5, s4, 15 -; GFX90A-NEXT: s_bfe_u32 s9, s4, 0xf000f +; GFX90A-NEXT: s_bfe_u32 s5, s4, 0xf000f ; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 ; GFX90A-NEXT: s_lshr_b32 s7, s6, 15 -; GFX90A-NEXT: s_bfe_u32 s10, s6, 0xf000f -; GFX90A-NEXT: s_or_b32 s11, s0, 1 +; GFX90A-NEXT: s_bfe_u32 s9, s6, 0xf000f +; GFX90A-NEXT: s_or_b32 s10, s0, 1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX90A-NEXT: s_cselect_b32 s0, s11, 0 +; GFX90A-NEXT: s_cselect_b32 s0, s10, 0 ; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 -; GFX90A-NEXT: s_bfe_i32 s0, s10, 0xf0000 +; GFX90A-NEXT: s_bfe_i32 s0, s9, 0xf0000 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GFX90A-NEXT: s_bfe_i32 s1, s9, 0xf0000 +; GFX90A-NEXT: s_bfe_i32 s1, s5, 0xf0000 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s1 ; GFX90A-NEXT: s_xor_b32 s0, s1, s0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 @@ -6959,13 +6922,13 @@ ; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX90A-NEXT: s_or_b32 s4, s0, 1 ; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| -; GFX90A-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX90A-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 ; GFX90A-NEXT: v_bfe_i32 v5, v1, 0, 15 ; GFX90A-NEXT: v_add_u32_e32 v4, s0, v6 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v5 -; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX90A-NEXT: v_bfe_i32 v7, v0, 0, 15 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v8, v7 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v6 @@ -6979,12 +6942,12 @@ ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| ; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s7 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GFX90A-NEXT: v_sub_u32_e32 v4, s5, v4 +; GFX90A-NEXT: v_sub_u32_e32 v4, s8, v4 ; GFX90A-NEXT: v_add_u32_e32 v5, v9, v5 ; GFX90A-NEXT: v_mul_lo_u32 v1, v5, v1 -; GFX90A-NEXT: v_and_b32_e32 v4, s8, v4 +; GFX90A-NEXT: v_and_b32_e32 v4, 0x7fff, v4 ; GFX90A-NEXT: v_sub_u32_e32 v0, v0, v1 -; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 +; GFX90A-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] ; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 @@ -7345,22 +7308,21 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s8, 0x1000, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_lshl_b32 s9, 0x1000, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX6-NEXT: s_lshl_b32 s9, 0x1000, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 +; GFX6-NEXT: s_lshl_b32 s8, 0x1000, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s0, 0x4f7ffffe +; GFX6-NEXT: s_sub_i32 s0, 0, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 -; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX6-NEXT: s_sub_i32 s0, 0, s9 +; GFX6-NEXT: s_sub_i32 s0, 0, s8 ; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 @@ -7369,25 +7331,25 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s3, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s9 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s8 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -7398,45 +7360,44 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s2, 0, s7 ; GFX9-NEXT: s_sub_i32 s3, 0, s6 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s7 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, s6 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s2, v3 +; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v3 -; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1] +; GFX9-NEXT: v_subrev_u32_e32 v6, s6, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 @@ -7452,24 +7413,23 @@ ; GFX90A-NEXT: s_lshl_b32 s6, 0x1000, s2 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX90A-NEXT: s_lshl_b32 s7, 0x1000, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_mov_b32 s1, 0x4f7ffffe ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX90A-NEXT: s_sub_i32 s0, 0, s6 -; GFX90A-NEXT: v_mul_f32_e32 v0, s1, v0 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v0 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s1, v1 ; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -7784,7 +7744,6 @@ ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s5, 0x4f7ffffe ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7795,9 +7754,9 @@ ; GFX6-NEXT: s_sub_i32 s4, 0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s5, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s5, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v0 ; GFX6-NEXT: s_sub_i32 s4, 0, s3 @@ -7815,12 +7774,12 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -7838,15 +7797,14 @@ ; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe +; GFX9-NEXT: s_sub_i32 s2, 0, s5 ; GFX9-NEXT: s_sub_i32 s3, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s5 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c @@ -7880,47 +7838,47 @@ ; ; GFX90A-LABEL: urem_v2i32_pow2_shl_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_mov_b32 s8, 0x4f7ffffe +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX90A-NEXT: s_lshl_b32 s1, 0x1000, s7 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX90A-NEXT: s_sub_i32 s6, 0, s0 +; GFX90A-NEXT: s_lshl_b32 s7, 0x1000, s2 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX90A-NEXT: s_lshl_b32 s6, 0x1000, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: s_sub_i32 s7, 0, s1 +; GFX90A-NEXT: s_sub_i32 s0, 0, s7 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX90A-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX90A-NEXT: s_sub_i32 s1, 0, s6 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s8, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v0 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s7 ; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s7, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s7, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v4 ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s6 ; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s6, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_endpgm @@ -8407,7 +8365,6 @@ ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s12, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb ; GFX6-NEXT: s_mov_b32 s7, 0xf000 @@ -8420,23 +8377,24 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: s_sub_i32 s11, 0, s2 ; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s3 -; GFX6-NEXT: s_ashr_i32 s3, s0, 31 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_add_i32 s0, s0, s3 ; GFX6-NEXT: s_ashr_i32 s1, s8, 31 -; GFX6-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_add_i32 s3, s8, s1 +; GFX6-NEXT: s_ashr_i32 s8, s0, 31 +; GFX6-NEXT: s_add_i32 s0, s0, s8 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_xor_b32 s10, s1, s10 ; GFX6-NEXT: v_mul_lo_u32 v1, s11, v0 -; GFX6-NEXT: s_xor_b32 s11, s0, s3 +; GFX6-NEXT: s_xor_b32 s11, s0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX6-NEXT: s_add_i32 s0, s8, s1 +; GFX6-NEXT: s_xor_b32 s0, s3, s1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s3, 0, s11 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_xor_b32 s8, s1, s10 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s12, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 @@ -8445,8 +8403,7 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX6-NEXT: s_sub_i32 s0, 0, s11 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 +; GFX6-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: s_add_i32 s1, s9, s0 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 @@ -8456,15 +8413,15 @@ ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: s_xor_b32 s2, s0, s3 +; GFX6-NEXT: s_xor_b32 s2, s0, s8 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s11 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s11, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 @@ -8479,34 +8436,34 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s10, 0x4f7ffffe ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 ; GFX9-NEXT: s_ashr_i32 s1, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s1 ; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 -; GFX9-NEXT: s_ashr_i32 s8, s6, 31 -; GFX9-NEXT: s_add_i32 s6, s6, s8 +; GFX9-NEXT: s_ashr_i32 s9, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s6, s8 +; GFX9-NEXT: s_xor_b32 s6, s6, s9 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_sub_i32 s9, 0, s0 -; GFX9-NEXT: v_mul_f32_e32 v0, s10, v0 +; GFX9-NEXT: s_sub_i32 s10, 0, s0 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_ashr_i32 s7, s4, 31 ; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: v_mul_lo_u32 v3, s9, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s10, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s10, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_xor_b32 s4, s4, s7 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 ; GFX9-NEXT: s_sub_i32 s10, 0, s6 -; GFX9-NEXT: s_ashr_i32 s9, s5, 31 -; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_add_i32 s5, s5, s8 +; GFX9-NEXT: s_xor_b32 s5, s5, s8 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 @@ -8515,10 +8472,9 @@ ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 -; GFX9-NEXT: s_xor_b32 s4, s5, s9 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 -; GFX9-NEXT: v_mul_hi_u32 v1, s4, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_subrev_u32_e32 v5, s0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc @@ -8527,9 +8483,9 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s0, s9, s8 +; GFX9-NEXT: s_xor_b32 s0, s8, s9 ; GFX9-NEXT: v_xor_b32_e32 v0, s1, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 @@ -8548,7 +8504,6 @@ ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c -; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_lshl_b32 s2, 0x1000, s2 @@ -8563,7 +8518,7 @@ ; GFX90A-NEXT: s_xor_b32 s6, s1, s8 ; GFX90A-NEXT: s_xor_b32 s1, s3, s1 ; GFX90A-NEXT: s_sub_i32 s3, 0, s2 -; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -8585,7 +8540,7 @@ ; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 ; GFX90A-NEXT: s_add_i32 s3, s7, s2 ; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: s_xor_b32 s1, s2, s1 ; GFX90A-NEXT: s_xor_b32 s2, s3, s2 @@ -9019,9 +8974,7 @@ ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s11, 0x4f7ffffe ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 ; GFX6-NEXT: s_ashr_i32 s4, s2, 31 @@ -9029,30 +8982,31 @@ ; GFX6-NEXT: s_xor_b32 s2, s2, s4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: s_ashr_i32 s9, s3, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s9 +; GFX6-NEXT: s_ashr_i32 s6, s3, 31 +; GFX6-NEXT: s_add_i32 s3, s3, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s10, 0, s2 -; GFX6-NEXT: s_xor_b32 s3, s3, s9 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX6-NEXT: v_mul_f32_e32 v0, s11, v0 +; GFX6-NEXT: s_xor_b32 s3, s3, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: s_sub_i32 s6, 0, s2 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: v_mul_lo_u32 v1, s10, v0 -; GFX6-NEXT: s_sub_i32 s10, 0, s3 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX6-NEXT: s_sub_i32 s6, 0, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s8, s0, 31 ; GFX6-NEXT: s_add_i32 s0, s0, s8 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: s_xor_b32 s0, s0, s8 -; GFX6-NEXT: s_ashr_i32 s9, s1, 31 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s11, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX6-NEXT: s_ashr_i32 s9, s1, 31 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 @@ -9083,65 +9037,65 @@ ; ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s8, 0x4f7ffffe -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 -; GFX9-NEXT: s_ashr_i32 s1, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_lshl_b32 s1, 0x1000, s7 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s1, s1, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX9-NEXT: s_sub_i32 s7, 0, s0 +; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s4 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 -; GFX9-NEXT: v_mul_f32_e32 v0, s8, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s4, s4, s6 +; GFX9-NEXT: s_lshl_b32 s5, 0x1000, s5 ; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: v_mul_f32_e32 v1, s8, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, s7, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s7, 0, s1 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v1 ; GFX9-NEXT: s_ashr_i32 s7, s5, 31 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s7 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: s_xor_b32 s5, s5, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s8, 0, s4 +; GFX9-NEXT: s_ashr_i32 s6, s2, 31 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_add_i32 s2, s2, s6 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s8, v0 +; GFX9-NEXT: s_sub_i32 s8, 0, s5 +; GFX9-NEXT: s_xor_b32 s2, s2, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: s_ashr_i32 s7, s3, 31 +; GFX9-NEXT: s_add_i32 s3, s3, s7 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX9-NEXT: s_xor_b32 s3, s3, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 +; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v1, s3, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: srem_v2i32_pow2_shl_denom: @@ -9149,7 +9103,6 @@ ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_mov_b32 s8, 0x4f7ffffe ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_lshl_b32 s0, 0x1000, s6 @@ -9162,36 +9115,34 @@ ; GFX90A-NEXT: s_add_i32 s1, s1, s7 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: s_xor_b32 s1, s1, s7 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX90A-NEXT: s_sub_i32 s7, 0, s0 -; GFX90A-NEXT: s_ashr_i32 s6, s4, 31 -; GFX90A-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: s_ashr_i32 s6, s4, 31 ; GFX90A-NEXT: s_add_i32 s4, s4, s6 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v0 +; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 ; GFX90A-NEXT: s_xor_b32 s4, s4, s6 -; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v0 -; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 ; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 ; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v0 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 -; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v0 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 ; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX90A-NEXT: s_add_i32 s4, s5, s0 ; GFX90A-NEXT: s_sub_i32 s5, 0, s1 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX90A-NEXT: v_mul_f32_e32 v1, s8, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: s_xor_b32 s4, s4, s0 -; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX90A-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX90A-NEXT: v_mul_lo_u32 v3, s5, v1 ; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX90A-NEXT: s_xor_b32 s4, s4, s0 ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX90A-NEXT: v_mul_hi_u32 v1, s4, v1 ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 @@ -9202,7 +9153,9 @@ ; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 ; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v1 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_endpgm @@ -12049,9 +12002,6 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 ; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 -; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc -; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -12062,42 +12012,40 @@ ; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 ; GFX6-NEXT: s_sub_u32 s6, 0, s10 ; GFX6-NEXT: s_subb_u32 s7, 0, s11 -; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 +; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_ashr_i32 s14, s1, 31 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 +; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s14, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s14 +; GFX6-NEXT: s_mov_b32 s15, s14 ; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s6, v0 -; GFX6-NEXT: s_mov_b32 s15, s14 +; GFX6-NEXT: v_mul_lo_u32 v5, s7, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s6, v0 +; GFX6-NEXT: s_addc_u32 s1, s1, s14 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX6-NEXT: s_addc_u32 s1, s1, s14 -; GFX6-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc @@ -12106,8 +12054,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 +; GFX6-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] ; GFX6-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] -; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s6, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 @@ -12136,8 +12084,8 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, s17, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s17, v0 +; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s8, s8, s12 -; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -12147,7 +12095,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NEXT: s_addc_u32 s9, s9, s12 +; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -12166,6 +12114,7 @@ ; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] ; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 ; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] +; GFX6-NEXT: s_addc_u32 s9, s9, s12 ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] ; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] @@ -12178,16 +12127,16 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 -; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 +; GFX6-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX6-NEXT: v_rcp_f32_e32 v3, v8 ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v3, s19, v3 -; GFX6-NEXT: v_mul_f32_e32 v4, s20, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mac_f32_e32 v3, s21, v4 +; GFX6-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: s_sub_u32 s0, 0, s8 @@ -12312,9 +12261,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 -; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc -; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -12325,20 +12271,19 @@ ; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 ; GFX9-NEXT: s_sub_u32 s2, 0, s10 ; GFX9-NEXT: s_subb_u32 s3, 0, s11 -; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 +; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s14, s5, 31 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 +; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s14, s5, 31 ; GFX9-NEXT: s_mov_b32 s15, s14 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 @@ -12348,16 +12293,16 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -12438,16 +12383,16 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 -; GFX9-NEXT: v_mac_f32_e32 v7, s16, v8 +; GFX9-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 ; GFX9-NEXT: v_rcp_f32_e32 v7, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, s17, v7 -; GFX9-NEXT: v_mul_f32_e32 v4, s18, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v7 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mac_f32_e32 v3, s19, v4 +; GFX9-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: s_sub_u32 s10, 0, s8 @@ -12569,9 +12514,7 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX90A-NEXT: s_mov_b32 s16, 0x4f800000 -; GFX90A-NEXT: s_mov_b32 s17, 0x5f7ffffc -; GFX90A-NEXT: s_mov_b32 s18, 0x2f800000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -12582,28 +12525,26 @@ ; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX90A-NEXT: s_sub_u32 s0, 0, s12 -; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 -; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_subb_u32 s1, 0, s13 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 ; GFX90A-NEXT: s_mov_b32 s15, s14 -; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 -; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v1 -; GFX90A-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 @@ -12703,15 +12644,15 @@ ; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 -; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 +; GFX90A-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 ; GFX90A-NEXT: s_sub_u32 s0, 0, s8 ; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 ; GFX90A-NEXT: v_mov_b32_e32 v5, s1 -; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 -; GFX90A-NEXT: v_mul_f32_e32 v3, s18, v2 +; GFX90A-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX90A-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 -; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 +; GFX90A-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX90A-NEXT: s_subb_u32 s1, 0, s9 @@ -13761,10 +13702,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 ; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 -; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc -; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 @@ -13774,44 +13713,41 @@ ; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 -; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 ; GFX6-NEXT: s_sub_u32 s2, 0, s16 ; GFX6-NEXT: s_subb_u32 s3, 0, s17 -; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_ashr_i32 s12, s9, 31 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 +; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s0, s8, s12 +; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v0 -; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX6-NEXT: s_addc_u32 s1, s9, s12 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX6-NEXT: s_addc_u32 s1, s9, s12 -; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc @@ -13820,6 +13756,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 @@ -13886,7 +13823,7 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s9 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 -; GFX6-NEXT: v_mac_f32_e32 v6, s18, v7 +; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 ; GFX6-NEXT: v_rcp_f32_e32 v6, v6 @@ -13896,10 +13833,10 @@ ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v3, s19, v6 -; GFX6-NEXT: v_mul_f32_e32 v4, s20, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v6 +; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mac_f32_e32 v3, s21, v4 +; GFX6-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: s_sub_u32 s0, 0, s8 @@ -14020,9 +13957,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 -; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc -; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -14033,20 +13967,19 @@ ; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 ; GFX9-NEXT: s_sub_u32 s2, 0, s12 ; GFX9-NEXT: s_subb_u32 s3, 0, s13 -; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 +; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s8, s5, 31 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 +; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 @@ -14056,16 +13989,16 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -14147,17 +14080,17 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v1 -; GFX9-NEXT: v_mac_f32_e32 v5, s16, v6 +; GFX9-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GFX9-NEXT: v_rcp_f32_e32 v5, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v4, s17, v5 -; GFX9-NEXT: v_mul_f32_e32 v5, s18, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v5 +; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mac_f32_e32 v4, s19, v5 +; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: s_sub_u32 s0, 0, s10 @@ -14279,9 +14212,7 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX90A-NEXT: s_mov_b32 s16, 0x4f800000 -; GFX90A-NEXT: s_mov_b32 s17, 0x5f7ffffc -; GFX90A-NEXT: s_mov_b32 s18, 0x2f800000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 ; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 @@ -14292,28 +14223,26 @@ ; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX90A-NEXT: s_sub_u32 s0, 0, s12 -; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 -; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_subb_u32 s1, 0, s13 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 ; GFX90A-NEXT: s_mov_b32 s15, s14 -; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 -; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v1 -; GFX90A-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v5, s0, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX90A-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 @@ -14415,14 +14344,14 @@ ; GFX90A-NEXT: v_xor_b32_e32 v0, s14, v0 ; GFX90A-NEXT: s_sub_u32 s0, 0, s4 ; GFX90A-NEXT: v_xor_b32_e32 v1, s14, v1 -; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 +; GFX90A-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, s14 ; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 -; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 -; GFX90A-NEXT: v_mul_f32_e32 v3, s18, v2 +; GFX90A-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX90A-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 -; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 +; GFX90A-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX90A-NEXT: s_subb_u32 s1, 0, s5 diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -275,12 +275,10 @@ ; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64: ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}} -; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}} -; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}} -; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]] -; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]] -; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]] -; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]] +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, v[[LO0]] +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, v[[HI0]] +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, v[[LO1]] +; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, v[[HI1]] ; SI: buffer_store_dwordx2 ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -518,7 +518,6 @@ ; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 ; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: s_mov_b32 s5, 0xffff0000 ; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8 @@ -529,9 +528,9 @@ ; SI-NEXT: v_bfi_b32 v0, s4, v0, v5 ; SI-NEXT: v_bfi_b32 v3, s4, v3, v6 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v7 -; SI-NEXT: v_and_b32_e32 v4, s5, v1 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, s5, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -249,8 +249,6 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s12, 0xff00 -; SI-NEXT: s_movk_i32 s13, 0xff ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s8, s4 ; SI-NEXT: s_mov_b32 s9, s5 @@ -260,12 +258,12 @@ ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 -; SI-NEXT: v_and_b32_e32 v2, s12, v0 -; SI-NEXT: v_and_b32_e32 v4, s12, v1 -; SI-NEXT: v_and_b32_e32 v3, s13, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xff00, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v1, s13, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -281,34 +279,31 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s12, 0xff00 -; VI-NEXT: s_movk_i32 s13, 0xff -; VI-NEXT: s_movk_i32 s14, 0x900 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_mov_b32 s10, s2 ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v4, s12, v1 +; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 -; VI-NEXT: v_and_b32_e32 v1, s13, v1 -; VI-NEXT: v_and_b32_e32 v2, s12, v0 -; VI-NEXT: v_and_b32_e32 v3, s13, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffffff00, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v1, s14, v1 -; VI-NEXT: v_add_u16_e32 v2, s14, v2 +; VI-NEXT: v_add_u16_e32 v1, 0x900, v1 +; VI-NEXT: v_add_u16_e32 v2, 0x900, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -327,35 +322,33 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[10:11] +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b32 s16, 0xff00 -; SI-NEXT: s_movk_i32 s17, 0xff -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 -; SI-NEXT: v_and_b32_e32 v2, s16, v0 -; SI-NEXT: v_and_b32_e32 v4, s16, v1 -; SI-NEXT: v_and_b32_e32 v3, s17, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff00, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xff00, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v1, s17, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -363,24 +356,21 @@ ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 ; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2_extra_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s16, 0xff00 -; VI-NEXT: s_movk_i32 s17, 0xff -; VI-NEXT: s_movk_i32 s18, 0x900 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_mov_b32 s15, s11 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 @@ -391,16 +381,16 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v4, s16, v1 +; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 -; VI-NEXT: v_and_b32_e32 v1, s17, v1 -; VI-NEXT: v_and_b32_e32 v2, s16, v0 -; VI-NEXT: v_and_b32_e32 v3, s17, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; VI-NEXT: v_and_b32_e32 v2, 0xffffff00, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v1, s18, v1 -; VI-NEXT: v_add_u16_e32 v2, s18, v2 +; VI-NEXT: v_add_u16_e32 v1, 0x900, v1 +; VI-NEXT: v_add_u16_e32 v2, 0x900, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -405,12 +405,11 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xffff ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, s0, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v3, s0, v1 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 @@ -435,12 +434,11 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-NEXT: v_and_b32_e32 v1, s0, v1 -; VI-NEXT: v_and_b32_e32 v0, s0, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 @@ -530,16 +528,15 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xffff ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, s0, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v5, s0, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v6, s0, v2 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v7, s0, v3 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 @@ -572,16 +569,15 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s0, v3 -; VI-NEXT: v_and_b32_e32 v2, s0, v2 -; VI-NEXT: v_and_b32_e32 v1, s0, v1 -; VI-NEXT: v_and_b32_e32 v0, s0, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0 ; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0 ; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 @@ -712,25 +708,24 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 ; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[0:3], 0 addr64 offset:16 -; SI-NEXT: s_mov_b32 s0, 0xffff ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, s0, v0 +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, s0, v4 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v9, s0, v5 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v10, s0, v6 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v11, s0, v7 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v13, s0, v1 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v14, s0, v2 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v15, s0, v3 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 ; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 @@ -783,16 +778,15 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s0, v3 -; VI-NEXT: v_and_b32_e32 v2, s0, v2 -; VI-NEXT: v_and_b32_e32 v1, s0, v1 -; VI-NEXT: v_and_b32_e32 v0, s0, v0 +; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 @@ -802,10 +796,10 @@ ; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0 ; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0 ; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0 -; VI-NEXT: v_and_b32_e32 v7, s0, v7 -; VI-NEXT: v_and_b32_e32 v6, s0, v6 -; VI-NEXT: v_and_b32_e32 v5, s0, v5 -; VI-NEXT: v_and_b32_e32 v4, s0, v4 +; VI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; VI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; VI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -1030,7 +1030,6 @@ ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s0, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[4:5] offset:1 @@ -1041,16 +1040,16 @@ ; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[4:5] offset:6 ; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[4:5] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) -; GFX9-GISEL-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-GISEL-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v4, 8, v4 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) -; GFX9-GISEL-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX9-GISEL-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v6, 8, v6 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX9-GISEL-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX9-GISEL-NEXT: v_lshlrev_b16_e32 v8, 8, v8 ; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -916,7 +916,6 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s0, 0xff ; SI-NEXT: s_mov_b32 s10, s6 ; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -930,11 +929,11 @@ ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, s0, v4 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 ; SI-NEXT: v_or_b32_e32 v0, v7, v0 -; SI-NEXT: v_and_b32_e32 v2, s0, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -136,8 +136,7 @@ } ; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32: -; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x41700000 -; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}} +; GCN: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+$}} ; GCN-NOT: v_mul ; GCN-NOT: v_max ; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] @@ -152,7 +151,7 @@ } ; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32: -; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} ; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000 ; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]] ; GCN-NOT: v_mul diff --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll --- a/llvm/test/CodeGen/AMDGPU/fexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fexp.ll @@ -18,9 +18,8 @@ ; GCN-LABEL: v_exp_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x3fb8aa3b -; GCN-NEXT: v_mul_f32_e32 v0, s4, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: v_exp_f32_e32 v1, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -32,10 +31,9 @@ ; GCN-LABEL: v_exp_v3f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x3fb8aa3b -; GCN-NEXT: v_mul_f32_e32 v0, s4, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: v_exp_f32_e32 v1, v1 ; GCN-NEXT: v_exp_f32_e32 v2, v2 @@ -48,11 +46,10 @@ ; GCN-LABEL: v_exp_v4f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x3fb8aa3b -; GCN-NEXT: v_mul_f32_e32 v0, s4, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 -; GCN-NEXT: v_mul_f32_e32 v3, s4, v3 +; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 ; GCN-NEXT: v_exp_f32_e32 v0, v0 ; GCN-NEXT: v_exp_f32_e32 v1, v1 ; GCN-NEXT: v_exp_f32_e32 v2, v2 @@ -95,11 +92,10 @@ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s4, 0x3fb8aa3b ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, s4, v0 -; SI-NEXT: v_mul_f32_e32 v1, s4, v1 +; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 ; SI-NEXT: v_exp_f32_e32 v0, v0 ; SI-NEXT: v_exp_f32_e32 v1, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -141,15 +137,14 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s4, 0x3fb8aa3b ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, s4, v0 -; SI-NEXT: v_mul_f32_e32 v1, s4, v1 -; SI-NEXT: v_mul_f32_e32 v2, s4, v2 -; SI-NEXT: v_mul_f32_e32 v3, s4, v3 +; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; SI-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; SI-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 +; SI-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 ; SI-NEXT: v_exp_f32_e32 v0, v0 ; SI-NEXT: v_exp_f32_e32 v1, v1 ; SI-NEXT: v_exp_f32_e32 v2, v2 @@ -159,11 +154,10 @@ ; VI-LABEL: v_exp_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_movk_i32 s4, 0x3dc5 ; VI-NEXT: v_mov_b32_e32 v3, 0x3dc5 -; VI-NEXT: v_mul_f16_e32 v2, s4, v1 +; VI-NEXT: v_mul_f16_e32 v2, 0x3dc5, v1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mul_f16_e32 v4, s4, v0 +; VI-NEXT: v_mul_f16_e32 v4, 0x3dc5, v0 ; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_exp_f16_e32 v2, v2 ; VI-NEXT: v_exp_f16_e32 v4, v4 @@ -177,9 +171,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_movk_i32 s4, 0x3dc5 -; GFX9-NEXT: v_mul_f16_e32 v2, s4, v1 +; GFX9-NEXT: v_mul_f16_e32 v2, 0x3dc5, v1 ; GFX9-NEXT: v_mul_f16_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_mul_f16_e32 v3, s4, v0 +; GFX9-NEXT: v_mul_f16_e32 v3, 0x3dc5, v0 ; GFX9-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_exp_f16_e32 v2, v2 ; GFX9-NEXT: v_exp_f16_e32 v3, v3 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -411,13 +411,12 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 -; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_add_u32_e32 v1, 4, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -444,16 +443,15 @@ ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_add_u32_e32 v1, 4, v0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 4, v0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -1117,12 +1115,11 @@ ; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1152,18 +1149,17 @@ ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104 -; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0 +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm @@ -1886,12 +1882,11 @@ ; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1921,18 +1916,17 @@ ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 -; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -241,9 +241,8 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -369,11 +368,10 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-SAFE-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX9-SAFE-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-SAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v8, 16, v0 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v10, 16, v1 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v2, v12, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -242,9 +242,8 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -370,11 +369,10 @@ ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 ; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-SAFE-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX9-SAFE-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-SAFE-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX9-SAFE-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX9-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-SAFE-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-SAFE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v0, v8, 16, v0 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v1, v10, 16, v1 ; GFX9-SAFE-NEXT: v_lshl_or_b32 v2, v12, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -161,13 +161,12 @@ } ; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32: -; GCN-SAFE-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}} ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[SIGNBIT]], [[A]] +; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[SIGNBIT]], [[ADD]] +; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] ; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]] diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir --- a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir @@ -170,9 +170,8 @@ # operands # CHECK-LABEL: name: add_f32_1.0_multi_f16_use -# CHECK: %13:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec -# CHECK: %14:vgpr_32 = V_ADD_F16_e32 killed %11, %13, implicit $mode, implicit $exec -# CHECK: %15:vgpr_32 = V_ADD_F16_e32 killed %12, killed %13, implicit $mode, implicit $exec +# CHECK: %14:vgpr_32 = V_ADD_F16_e32 1065353216, killed %11, implicit $mode, implicit $exec +# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, killed %12, implicit $mode, implicit $exec name: add_f32_1.0_multi_f16_use @@ -306,9 +305,8 @@ # constant, and not folded as a multi-use literal for the f16 cases # CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use -# CHECK: %14:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec -# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %11, %14, implicit $mode, implicit $exec -# CHECK: %16:vgpr_32 = V_ADD_F16_e32 %12, %14, implicit $mode, implicit $exec +# CHECK: %15:vgpr_32 = V_ADD_F16_e32 1065353216, %11, implicit $mode, implicit $exec +# CHECK: %16:vgpr_32 = V_ADD_F16_e32 1065353216, %12, implicit $mode, implicit $exec # CHECK: %17:vgpr_32 = V_ADD_F32_e32 1065353216, killed %13, implicit $mode, implicit $exec name: add_f32_1.0_one_f32_use_multi_f16_use @@ -511,9 +509,8 @@ # constant, and not folded as a multi-use literal for the f16 cases # CHECK-LABEL: name: add_f16_1.0_multi_f32_use -# CHECK: %13:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec -# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $mode, implicit $exec -# CHECK: %15:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec +# CHECK: %14:vgpr_32 = V_ADD_F32_e32 15360, %11, implicit $mode, implicit $exec +# CHECK: %15:vgpr_32 = V_ADD_F32_e32 15360, %12, implicit $mode, implicit $exec name: add_f16_1.0_multi_f32_use alignment: 1 @@ -575,12 +572,10 @@ --- # The low 16-bits are an inline immediate, but the high bits are junk -# FIXME: Should be able to fold this # CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use -# CHECK: %13:vgpr_32 = V_MOV_B32_e32 80886784, implicit $exec -# CHECK: %14:vgpr_32 = V_ADD_F16_e32 %11, %13, implicit $mode, implicit $exec -# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $mode, implicit $exec +# CHECK: %14:vgpr_32 = V_ADD_F16_e32 80886784, %11, implicit $mode, implicit $exec +# CHECK: %15:vgpr_32 = V_ADD_F16_e32 80886784, %12, implicit $mode, implicit $exec name: add_f16_1.0_other_high_bits_multi_f16_use alignment: 1 @@ -641,13 +636,9 @@ ... --- -# FIXME: Should fold inline immediate into f16 and literal use into -# f32 instruction. - # CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32 -# CHECK: %13:vgpr_32 = V_MOV_B32_e32 305413120, implicit $exec -# CHECK: %14:vgpr_32 = V_ADD_F32_e32 %11, %13, implicit $mode, implicit $exec -# CHECK: %15:vgpr_32 = V_ADD_F16_e32 %12, %13, implicit $mode, implicit $exec +# CHECK: %14:vgpr_32 = V_ADD_F32_e32 305413120, %11, implicit $mode, implicit $exec +# CHECK: %15:vgpr_32 = V_ADD_F16_e32 305413120, %12, implicit $mode, implicit $exec name: add_f16_1.0_other_high_bits_use_f16_f32 alignment: 1 exposesReturnsTwice: false diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -242,19 +242,18 @@ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, 0x2f800000 -; SI-NEXT: s_mov_b32 s3, 0xcf800000 +; SI-NEXT: s_mov_b32 s2, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_trunc_f32_e32 v0, s1 ; SI-NEXT: v_trunc_f32_e32 v2, s0 -; SI-NEXT: v_mul_f32_e32 v1, s2, v0 -; SI-NEXT: v_mul_f32_e32 v3, s2, v2 +; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_floor_f32_e32 v4, v1 ; SI-NEXT: v_floor_f32_e32 v5, v3 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v5 -; SI-NEXT: v_fma_f32 v0, v4, s3, v0 -; SI-NEXT: v_fma_f32 v4, v5, s3, v2 +; SI-NEXT: v_fma_f32 v0, v4, s2, v0 +; SI-NEXT: v_fma_f32 v4, v5, s2, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -263,13 +262,12 @@ ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s4, 0x2f800000 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trunc_f32_e32 v0, s3 ; VI-NEXT: v_trunc_f32_e32 v4, s2 -; VI-NEXT: v_mul_f32_e32 v1, s4, v0 -; VI-NEXT: v_mul_f32_e32 v2, s4, v4 +; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4 ; VI-NEXT: v_floor_f32_e32 v5, v1 ; VI-NEXT: s_mov_b32 s2, 0xcf800000 ; VI-NEXT: v_floor_f32_e32 v6, v2 @@ -379,29 +377,28 @@ ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s8, 0x2f800000 -; SI-NEXT: s_mov_b32 s9, 0xcf800000 +; SI-NEXT: s_mov_b32 s8, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_trunc_f32_e32 v0, s1 ; SI-NEXT: v_trunc_f32_e32 v2, s0 ; SI-NEXT: v_trunc_f32_e32 v4, s3 ; SI-NEXT: v_trunc_f32_e32 v6, s2 -; SI-NEXT: v_mul_f32_e32 v1, s8, v0 -; SI-NEXT: v_mul_f32_e32 v3, s8, v2 -; SI-NEXT: v_mul_f32_e32 v5, s8, v4 -; SI-NEXT: v_mul_f32_e32 v7, s8, v6 +; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; SI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; SI-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 ; SI-NEXT: v_floor_f32_e32 v8, v1 ; SI-NEXT: v_floor_f32_e32 v9, v3 ; SI-NEXT: v_floor_f32_e32 v10, v5 ; SI-NEXT: v_floor_f32_e32 v11, v7 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v9 -; SI-NEXT: v_fma_f32 v0, v8, s9, v0 -; SI-NEXT: v_fma_f32 v8, v9, s9, v2 +; SI-NEXT: v_fma_f32 v0, v8, s8, v0 +; SI-NEXT: v_fma_f32 v8, v9, s8, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v7, v10 ; SI-NEXT: v_cvt_u32_f32_e32 v5, v11 -; SI-NEXT: v_fma_f32 v4, v10, s9, v4 -; SI-NEXT: v_fma_f32 v9, v11, s9, v6 +; SI-NEXT: v_fma_f32 v4, v10, s8, v4 +; SI-NEXT: v_fma_f32 v9, v11, s8, v6 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v4 @@ -414,13 +411,13 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s6, 0x2f800000 ; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trunc_f32_e32 v0, s1 ; VI-NEXT: v_trunc_f32_e32 v4, s0 -; VI-NEXT: v_mul_f32_e32 v1, s6, v0 -; VI-NEXT: v_mul_f32_e32 v2, s6, v4 +; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4 ; VI-NEXT: v_floor_f32_e32 v5, v1 ; VI-NEXT: s_mov_b32 s0, 0xcf800000 ; VI-NEXT: v_floor_f32_e32 v6, v2 @@ -429,11 +426,11 @@ ; VI-NEXT: v_fma_f32 v0, v6, s0, v4 ; VI-NEXT: v_trunc_f32_e32 v4, s3 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v5 -; VI-NEXT: v_mul_f32_e32 v5, s6, v4 +; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; VI-NEXT: v_trunc_f32_e32 v8, s2 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v6 ; VI-NEXT: v_floor_f32_e32 v6, v5 -; VI-NEXT: v_mul_f32_e32 v5, s6, v8 +; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v8 ; VI-NEXT: v_floor_f32_e32 v9, v5 ; VI-NEXT: v_fma_f32 v4, v6, s0, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v7, v6 @@ -442,7 +439,6 @@ ; VI-NEXT: v_cvt_u32_f32_e32 v5, v9 ; VI-NEXT: v_cvt_u32_f32_e32 v4, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/fpow.ll @@ -479,14 +479,13 @@ ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: s_mov_b32 s4, 0x80008000 -; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_log_f32_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -2619,8 +2619,7 @@ ; SI-NEXT: v_and_b32_e32 v10, v8, v10 ; SI-NEXT: v_not_b32_e32 v11, v11 ; SI-NEXT: v_and_b32_e32 v11, v9, v11 -; SI-NEXT: s_brev_b32 s8, 1 -; SI-NEXT: v_and_b32_e32 v13, s8, v9 +; SI-NEXT: v_and_b32_e32 v13, 0x80000000, v9 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12 ; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12 @@ -2650,7 +2649,7 @@ ; SI-NEXT: v_and_b32_e32 v8, v6, v8 ; SI-NEXT: v_not_b32_e32 v9, v9 ; SI-NEXT: v_and_b32_e32 v9, v7, v9 -; SI-NEXT: v_and_b32_e32 v11, s8, v7 +; SI-NEXT: v_and_b32_e32 v11, 0x80000000, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -713,10 +713,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX9-NEXT: s_mov_b32 s4, 0xf000f ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -753,14 +752,13 @@ ; SI-NEXT: v_or_b32_e32 v4, 16, v6 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4 -; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_or_b32_e32 v3, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v2, s4, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -860,12 +858,11 @@ ; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4 ; SI-NEXT: v_or_b32_e32 v4, 16, v10 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, s4, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 @@ -927,10 +924,9 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -24,15 +24,14 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 @@ -44,9 +43,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -56,13 +54,13 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -163,19 +161,18 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s5, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -577,15 +574,14 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 @@ -597,9 +593,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -609,13 +604,13 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -627,16 +622,15 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v2 -; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) @@ -998,13 +992,12 @@ ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1018,7 +1011,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1028,15 +1020,15 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s2, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1134,14 +1126,13 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s4 @@ -1154,9 +1145,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1170,13 +1160,13 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1275,15 +1265,14 @@ ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 @@ -1295,7 +1284,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1305,15 +1293,15 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s3 +; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1418,13 +1406,12 @@ ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] ; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1438,7 +1425,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1448,15 +1434,15 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s3 +; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1560,15 +1546,14 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v2, v0 @@ -1580,9 +1565,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1592,13 +1576,13 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1702,17 +1686,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s5 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1723,9 +1706,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1735,13 +1717,13 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2006,15 +1988,14 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s4 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 @@ -2027,9 +2008,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2039,13 +2019,13 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s3 +; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s2 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2058,20 +2038,19 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v2 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 ; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm @@ -2081,20 +2060,19 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 ; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm @@ -2324,17 +2302,16 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s4 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2345,9 +2322,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2357,13 +2333,13 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -2638,13 +2614,12 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ushort v4, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -188,28 +188,27 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -418,13 +417,12 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 @@ -963,34 +961,33 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v1, v2, 8, 8 ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8 ; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v6 ; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v8 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v6, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -21,21 +21,20 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, s4 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 @@ -47,9 +46,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -59,15 +57,15 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s3 +; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 @@ -189,13 +187,12 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 @@ -226,17 +223,16 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ushort v1, v[2:3] -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_and_b32_e32 v6, s0, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX8-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX8-NEXT: v_and_b32_sdwa v10, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v7, s0, v0 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v0 -; GFX8-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v1, v6, v7, v1 ; GFX8-NEXT: v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -259,13 +255,13 @@ ; GFX9-NODL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) -; GFX9-NODL-NEXT: v_and_b32_e32 v0, s0, v2 +; GFX9-NODL-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v5, s0, v3 +; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v3 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v6, s0, v6 -; GFX9-NODL-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX9-NODL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -290,13 +286,13 @@ ; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v0, s0, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v0, 0xff, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xff, v3 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v6, s0, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4 ; GFX9-DL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -401,13 +397,12 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v8 @@ -584,12 +579,11 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v4 @@ -725,13 +719,12 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v1, v8 @@ -908,14 +901,13 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v3, v6, v3, v8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 @@ -1089,22 +1081,21 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v8, v1, v5, s4 ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 @@ -1116,9 +1107,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1128,15 +1118,15 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s3 +; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s2 ; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 @@ -1153,22 +1143,21 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_and_b32_e32 v4, s0, v2 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] @@ -1179,22 +1168,21 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 ; GFX9-DL-NEXT: v_add3_u32 v2, v5, v3, v2 ; GFX9-DL-NEXT: v_add3_u32 v1, v2, v6, v1 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] @@ -1284,24 +1272,23 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s5 +; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, s4 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, s5, v3 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, s4, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1312,9 +1299,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1325,19 +1311,19 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8 -; GFX8-NEXT: v_and_b32_e32 v1, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 8 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX8-NEXT: v_and_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s3 +; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s2 ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s3, v4 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v4 ; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1481,15 +1467,14 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 @@ -1507,7 +1492,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1523,13 +1507,13 @@ ; GFX8-NEXT: flat_load_ushort v10, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 -; GFX8-NEXT: v_and_b32_e32 v7, s2, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX8-NEXT: v_bfe_i32 v1, v4, 0, 8 ; GFX8-NEXT: v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 -; GFX8-NEXT: v_and_b32_e32 v8, s2, v8 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX8-NEXT: v_bfe_i32 v6, v0, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v7, v7, v8, v10 @@ -1556,8 +1540,8 @@ ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-NODL-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX9-NODL-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX9-NODL-NEXT: v_bfe_i32 v0, v2, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v4, v3, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) @@ -1587,8 +1571,8 @@ ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v3 -; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX9-DL-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX9-DL-NEXT: v_bfe_i32 v0, v2, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v4, v3, 0, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) @@ -1694,19 +1678,18 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 @@ -1720,9 +1703,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -1735,14 +1717,14 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3 ; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v5, 8, v3 -; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0 -; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s3 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2 ; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0 ; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0 ; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0 @@ -1864,22 +1846,20 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v8, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xff00 -; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v3, s5, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff00, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v5, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff00, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v6, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s5, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v8 ; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 @@ -1897,7 +1877,6 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -1915,12 +1894,12 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v6, 8, v4 ; GFX8-NEXT: v_and_b32_sdwa v9, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v8, 8, v0 ; GFX8-NEXT: v_and_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v0, v4, v0, v10 ; GFX8-NEXT: v_mad_u16 v0, v6, v8, v0 @@ -1953,14 +1932,14 @@ ; GFX9-NODL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 ; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v5, 16, v1 -; GFX9-NODL-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX9-NODL-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX9-NODL-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v8, 16, v10 -; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 +; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-NODL-NEXT: v_lshl_or_b32 v5, v6, 16, v9 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 -; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 +; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v5, v4 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -1991,14 +1970,14 @@ ; GFX9-DL-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2 ; GFX9-DL-NEXT: v_lshl_or_b32 v1, v5, 16, v1 -; GFX9-DL-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX9-DL-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v8, 16, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v9 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v5, v4 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2091,13 +2070,12 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v3, v3, v6, v8 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -344,49 +344,48 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 ; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v16 ; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 ; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 ; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 ; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v14, s4, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v15, s4, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -917,49 +916,48 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 4 ; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v9, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX7-NEXT: v_bfe_i32 v10, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX7-NEXT: v_bfe_i32 v11, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v1, v9, v16 ; GFX7-NEXT: v_bfe_i32 v5, v2, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-NEXT: v_bfe_i32 v12, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 ; GFX7-NEXT: v_bfe_i32 v6, v2, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 ; GFX7-NEXT: v_bfe_i32 v7, v2, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX7-NEXT: v_bfe_i32 v14, v0, 20, 4 -; GFX7-NEXT: v_and_b32_e32 v13, s4, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 ; GFX7-NEXT: v_bfe_i32 v8, v2, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX7-NEXT: v_bfe_i32 v15, v0, 24, 4 -; GFX7-NEXT: v_and_b32_e32 v14, s4, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 28, v2 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v15, s4, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -2204,7 +2202,6 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v1, v2, 20, 4 @@ -2212,9 +2209,9 @@ ; GFX7-NEXT: v_bfe_i32 v4, v2, 4, 4 ; GFX7-NEXT: v_bfe_i32 v5, v2, 0, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v9, v0, 20, 4 ; GFX7-NEXT: v_bfe_i32 v10, v0, 16, 4 @@ -2223,19 +2220,19 @@ ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v5, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v10 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v12 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v12 ; GFX7-NEXT: v_bfe_i32 v13, v0, 24, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX7-NEXT: v_or_b32_e32 v5, v10, v9 -; GFX7-NEXT: v_and_b32_e32 v11, s4, v13 -; GFX7-NEXT: v_and_b32_e32 v13, s4, v15 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v15 ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_bfe_i32 v7, v2, 8, 4 ; GFX7-NEXT: v_bfe_i32 v14, v0, 8, 4 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2243,22 +2240,22 @@ ; GFX7-NEXT: v_bfe_i32 v6, v2, 24, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2 ; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v14 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v14 ; GFX7-NEXT: v_mad_u32_u24 v3, v15, v10, v3 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_mad_u32_u24 v3, v7, v12, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v3 ; GFX7-NEXT: v_mad_u32_u24 v0, v1, v4, v0 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: v_mad_u32_u24 v0, v14, v9, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v11, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v13, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -2354,74 +2351,73 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NEXT: global_load_dword v4, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v3, 12, 4 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX9-NEXT: v_bfe_u32 v0, v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v6, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v7, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v8, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX9-NEXT: v_bfe_u32 v10, v3, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v3 +; GFX9-NEXT: v_bfe_u32 v12, v3, 16, 4 +; GFX9-NEXT: v_bfe_u32 v13, v3, 20, 4 +; GFX9-NEXT: v_bfe_u32 v14, v3, 8, 4 +; GFX9-NEXT: v_bfe_u32 v15, v3, 12, 4 ; GFX9-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v10, v2, v10 -; GFX9-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v17 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v9 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v16 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v15, 16, v14 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v13, 16, v12 +; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX9-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v16, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v11 -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v2, v12, 16, v2 -; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v8, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v7, v4 +; GFX9-NEXT: v_lshl_or_b32 v8, v11, 16, v10 +; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v6 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v6, v7 +; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v4, v3, v4 -; GFX9-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u16_e32 v3, v2, v3 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v2, v2, v4 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v2, v2, v5 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v0 ; GFX9-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v1, v0, s[2:3] @@ -2437,74 +2433,73 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v4, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-DL-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v3, 12, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX9-DL-NEXT: v_bfe_u32 v0, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v3, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v3 +; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 12, 4 ; GFX9-DL-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, v2, v10 -; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v17 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9 +; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v16 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, v3 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX9-DL-NEXT: global_load_ushort v3, v1, s[2:3] +; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX9-DL-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v15, 16, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v13, 16, v12 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-DL-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v16, 16, v15 -; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v11 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v12, 16, v2 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v8, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v7, v4 +; GFX9-DL-NEXT: v_lshl_or_b32 v8, v11, 16, v10 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v6, v7 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v4, v3, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v2, v3 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v4 +; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v5 +; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v0 ; GFX9-DL-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] @@ -2811,20 +2806,18 @@ ; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_mov_b32 s5, 0xffff ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 28, v2 ; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 ; GFX7-NEXT: v_bfe_i32 v16, v0, 4, 4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v9 -; GFX7-NEXT: v_and_b32_e32 v9, s4, v13 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v16 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 ; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 @@ -2834,11 +2827,11 @@ ; GFX7-NEXT: v_bfe_i32 v8, v2, 4, 4 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v0 ; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4 ; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4 @@ -2849,28 +2842,28 @@ ; GFX7-NEXT: v_or_b32_e32 v5, v7, v6 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v10 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v12 ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v14 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v15 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v15 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX7-NEXT: v_or_b32_e32 v7, v9, v8 ; GFX7-NEXT: v_or_b32_e32 v8, v12, v11 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v13 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX7-NEXT: v_and_b32_e32 v6, s4, v2 -; GFX7-NEXT: v_and_b32_e32 v12, s4, v0 -; GFX7-NEXT: v_and_b32_e32 v5, s5, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v7 ; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 8 ; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 @@ -2878,14 +2871,14 @@ ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v8, s4, v3 -; GFX7-NEXT: v_and_b32_e32 v14, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v3 +; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v4 ; GFX7-NEXT: v_bfe_u32 v9, v3, 8, 8 ; GFX7-NEXT: v_bfe_u32 v15, v4, 8, 8 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v6, v6, v12, v16 ; GFX7-NEXT: v_mad_u32_u24 v6, v7, v13, v6 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2119,7 +2119,6 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v16, off, s[0:3], 0 -; GFX7-NEXT: s_mov_b32 s4, 0xf0000 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_u32 v7, v2, 20, 4 @@ -2130,12 +2129,12 @@ ; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v6, 15, v2 ; GFX7-NEXT: v_alignbit_b32 v2, v7, v2, 16 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 0xf0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 12, v0 ; GFX7-NEXT: v_and_b32_e32 v13, 15, v0 ; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 0xf0000, v8 ; GFX7-NEXT: v_or_b32_e32 v7, v13, v7 ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 15, v6 @@ -2226,58 +2225,57 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-NEXT: global_load_dword v4, v0, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v10, 15, v3 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-NEXT: v_bfe_u32 v9, v3, 12, 4 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX9-NEXT: v_bfe_u32 v0, v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v6, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v7, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v8, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX9-NEXT: v_bfe_u32 v10, v3, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v3 +; GFX9-NEXT: v_bfe_u32 v12, v3, 16, 4 +; GFX9-NEXT: v_bfe_u32 v13, v3, 20, 4 +; GFX9-NEXT: v_bfe_u32 v14, v3, 8, 4 +; GFX9-NEXT: v_bfe_u32 v15, v3, 12, 4 ; GFX9-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v11, v2, v11 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v10 -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v17 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v4 -; GFX9-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v5, v14, 16, v13 -; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v15 -; GFX9-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, v6, v5 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v8, v7 -; GFX9-NEXT: v_lshl_or_b32 v10, v12, 16, v11 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v16 +; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v9 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX9-NEXT: global_load_ushort v3, v1, s[2:3] +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v13, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-NEXT: v_lshl_or_b32 v6, v15, 16, v14 +; GFX9-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v7, v6 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-NEXT: v_lshl_or_b32 v10, v11, 16, v10 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v4, v2, v4 -; GFX9-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v2, v3 +; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v5 ; GFX9-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u16_e32 v2, v2, v3 -; GFX9-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v2, v2, v4 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v0 ; GFX9-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v1, v0, s[2:3] @@ -2293,58 +2291,57 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5] -; GFX9-DL-NEXT: global_load_dword v4, v0, s[6:7] ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7] ; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v0, v3, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v3, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_u32 v11, v4, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v4, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v4, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX9-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v3, 12, 4 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v3 +; GFX9-DL-NEXT: v_bfe_u32 v0, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v3, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v3 +; GFX9-DL-NEXT: v_bfe_u32 v12, v3, 16, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v3, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v14, v3, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v15, v3, 12, 4 ; GFX9-DL-NEXT: v_bfe_u32 v3, v3, 4, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v4, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v4, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v4, v4, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v11, v2, v11 -; GFX9-DL-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-DL-NEXT: v_and_b32_e32 v13, v2, v13 -; GFX9-DL-NEXT: v_and_b32_e32 v6, v2, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v15, v2, v15 -; GFX9-DL-NEXT: v_and_b32_e32 v8, v2, v8 -; GFX9-DL-NEXT: v_and_b32_e32 v17, v2, v17 -; GFX9-DL-NEXT: v_and_b32_e32 v2, v2, v10 -; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v17 -; GFX9-DL-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4 -; GFX9-DL-NEXT: global_load_ushort v4, v1, s[2:3] -; GFX9-DL-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-DL-NEXT: v_lshl_or_b32 v5, v14, 16, v13 -; GFX9-DL-NEXT: v_lshl_or_b32 v6, v7, 16, v6 -; GFX9-DL-NEXT: v_lshl_or_b32 v7, v16, 16, v15 -; GFX9-DL-NEXT: v_lshl_or_b32 v8, v9, 16, v8 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v6, v5 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v8, v7 -; GFX9-DL-NEXT: v_lshl_or_b32 v10, v12, 16, v11 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v16 +; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX9-DL-NEXT: global_load_ushort v3, v1, s[2:3] +; GFX9-DL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX9-DL-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-DL-NEXT: v_lshl_or_b32 v4, v13, 16, v12 +; GFX9-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX9-DL-NEXT: v_lshl_or_b32 v6, v15, 16, v14 +; GFX9-DL-NEXT: v_lshl_or_b32 v7, v8, 16, v7 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v7, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX9-DL-NEXT: v_lshl_or_b32 v10, v11, 16, v10 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v0, v0, v10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v4, v2, v4 -; GFX9-DL-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v2, v3 +; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v5 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v4 +; GFX9-DL-NEXT: v_add_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v0 ; GFX9-DL-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v1, v0, s[2:3] @@ -2479,8 +2476,6 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ubyte v16, off, s[0:3], 0 -; GFX7-NEXT: s_movk_i32 s4, 0xf00 -; GFX7-NEXT: s_movk_i32 s5, 0xf0f ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 4, v2 @@ -2492,27 +2487,27 @@ ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 4, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 28, v0 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xf00, v7 +; GFX7-NEXT: v_and_b32_e32 v3, 0xf00, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 15, v2 ; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 4 ; GFX7-NEXT: v_and_b32_e32 v11, 15, v0 ; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 12, v0 ; GFX7-NEXT: v_alignbit_b32 v2, v5, v2, 24 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v5, 0xf00, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 4, v0 ; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX7-NEXT: v_alignbit_b32 v0, v12, v0, 24 -; GFX7-NEXT: v_and_b32_e32 v7, s4, v10 +; GFX7-NEXT: v_and_b32_e32 v7, 0xf00, v10 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_and_b32_e32 v3, s4, v14 -; GFX7-NEXT: v_and_b32_e32 v5, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v3, 0xf00, v14 +; GFX7-NEXT: v_and_b32_e32 v5, 0xf00, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xf0f, v0 ; GFX7-NEXT: v_or_b32_e32 v7, v9, v7 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xf0f, v2 ; GFX7-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1304,13 +1304,12 @@ ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_movk_i32 s5, 0xff -; SI-NEXT: s_lshr_b32 s6, s11, 8 +; SI-NEXT: s_lshr_b32 s5, s11, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_cmp_lg_u32 s4, 13 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 12 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc @@ -1318,28 +1317,27 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v2, s5, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_mov_b32 s6, 0xffff -; SI-NEXT: s_lshr_b32 s7, s10, 24 +; SI-NEXT: s_lshr_b32 s5, s10, 24 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_cmp_lg_u32 s4, 11 ; SI-NEXT: v_or_b32_e32 v3, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s7, s10, 16 +; SI-NEXT: s_lshr_b32 s5, s10, 16 ; SI-NEXT: s_cmp_lg_u32 s4, 10 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_lshr_b32 s7, s10, 8 +; SI-NEXT: s_lshr_b32 s5, s10, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_cmp_lg_u32 s4, 9 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 8 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc @@ -1347,27 +1345,27 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v2, s5, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_lshr_b32 s7, s9, 24 +; SI-NEXT: s_lshr_b32 s5, s9, 24 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_cmp_lg_u32 s4, 7 ; SI-NEXT: v_or_b32_e32 v2, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s7, s9, 16 +; SI-NEXT: s_lshr_b32 s5, s9, 16 ; SI-NEXT: s_cmp_lg_u32 s4, 6 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc -; SI-NEXT: s_lshr_b32 s7, s9, 8 +; SI-NEXT: s_lshr_b32 s5, s9, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_cmp_lg_u32 s4, 5 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 4 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc @@ -1375,27 +1373,27 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_and_b32_e32 v4, s5, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: s_lshr_b32 s7, s8, 24 +; SI-NEXT: s_lshr_b32 s5, s8, 24 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v1, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_cmp_lg_u32 s4, 3 ; SI-NEXT: v_or_b32_e32 v1, v1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_lshr_b32 s7, s8, 16 +; SI-NEXT: s_lshr_b32 s5, s8, 16 ; SI-NEXT: s_cmp_lg_u32 s4, 2 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc -; SI-NEXT: s_lshr_b32 s7, s8, 8 +; SI-NEXT: s_lshr_b32 s5, s8, 8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: v_and_b32_e32 v4, s5, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: s_cmp_lg_u32 s4, 1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc @@ -1403,10 +1401,10 @@ ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_and_b32_e32 v5, s5, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, s6, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -553,10 +553,9 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_d_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-NEXT: v_and_b32_e32 v4, v6, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX9-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -585,13 +584,12 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v12, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v11, v7, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-NEXT: v_and_b32_e32 v5, v2, v6 -; GFX9-NEXT: v_and_b32_e32 v3, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_lshl_or_b32 v11, v7, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: image_sample_d v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -636,13 +634,12 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_c_d_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v9, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v1, v9, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 ; GFX9-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 @@ -689,10 +686,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_d_cl_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX9-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX9-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 @@ -742,12 +738,11 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: v_and_b32_e32 v5, v0, v5 -; GFX9-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 ; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -789,10 +784,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_cd_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX9-NEXT: v_and_b32_e32 v4, v6, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX9-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -836,13 +830,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t) { ; GFX9-LABEL: sample_c_cd_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v9, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GFX9-NEXT: v_lshl_or_b32 v3, v6, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v1, v9, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 ; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 @@ -889,10 +882,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp) { ; GFX9-LABEL: sample_cd_cl_2d: ; GFX9: ; %bb.0: ; %main_body -; GFX9-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX9-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX9-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX9-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v5, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 @@ -942,12 +934,11 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: v_and_b32_e32 v5, v0, v5 -; GFX9-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v5 -; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GFX9-NEXT: v_lshl_or_b32 v10, v6, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 ; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[7:11], s[0:7], s[8:11] dmask:0xf a16 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1136,13 +1127,12 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v13, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v6 -; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 ; GFX9-NEXT: image_sample_c_d_o v0, v[8:13], s[0:7], s[8:11] dmask:0x4 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1172,13 +1162,12 @@ ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: v_mov_b32_e32 v13, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v6 -; GFX9-NEXT: v_lshl_or_b32 v12, v7, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX9-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 ; GFX9-NEXT: image_sample_c_d_o v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 a16 da ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -84,19 +84,18 @@ ; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, 0x3e22f983 +; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x3e22f983, v1 ; GFX6-NEXT: v_fract_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_cos_f32_e32 v0, v0 ; GFX6-NEXT: v_cos_f32_e32 v1, v1 -; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll @@ -32,25 +32,22 @@ ; SI: buffer_load_dword v[[A_F16_0:[0-9]+]] ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] -; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218 -; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c ; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x398c ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] ; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]] +; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], 0x3f317218, v[[R_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]] -; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]] +; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], 0x3f317218, v[[R_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]] ; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]] ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] ; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] -; VI: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]] -; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] +; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x398c, v[[R_F16_2]] +; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -32,10 +32,9 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %res = call <2 x float> @llvm.log.v2f32(<2 x float> %in) @@ -67,14 +66,13 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3f317218 ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f317218, v{{[0-9]+}} define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %res = call <4 x float> @llvm.log.v4f32(<4 x float> %in) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll @@ -32,25 +32,23 @@ ; SI: buffer_load_dword v[[A_F16_0:[0-9]+]] ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] -; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a -; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1 ; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x34d1 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] ; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]] ; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]] -; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], [[A_F32_2]], v[[R_F32_0]] +; SI: v_mul_f32_e32 v[[R_F32_5:[0-9]+]], 0x3e9a209a, v[[R_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_5]] -; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], [[A_F32_2]], v[[R_F32_1]] +; SI: v_mul_f32_e32 v[[R_F32_6:[0-9]+]], 0x3e9a209a, v[[R_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_6]] ; GFX9: v_log_f16_e32 v[[R_F16_2:[0-9]+]], v[[A_F16_0]] ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] ; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_0]] -; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] -; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] +; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_2]] +; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x34d1, v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -32,10 +32,9 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3e9a209a ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %res = call <2 x float> @llvm.log10.v2f32(<2 x float> %in) @@ -67,14 +66,13 @@ ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: s_mov_b32 [[R_F32_LOG_CONST:s[0-9]+]], 0x3e9a209a ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_log_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, [[R_F32_LOG_CONST]], v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3e9a209a, v{{[0-9]+}} define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %res = call <4 x float> @llvm.log10.v4f32(<4 x float> %in) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -84,19 +84,18 @@ ; GFX6-NEXT: s_mov_b32 s8, s6 ; GFX6-NEXT: s_mov_b32 s9, s7 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_mov_b32 s0, 0x3e22f983 +; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_mov_b32 s1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x3e22f983, v1 ; GFX6-NEXT: v_fract_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0 ; GFX6-NEXT: v_fract_f32_e32 v0, v0 ; GFX6-NEXT: v_sin_f32_e32 v0, v0 ; GFX6-NEXT: v_sin_f32_e32 v1, v1 -; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1347,13 +1347,12 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s2, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -1365,13 +1364,12 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx2 v[3:4], v[0:1] -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s2, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s2, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GCN-HSA-NEXT: flat_store_dwordx3 v[5:6], v[0:2] ; GCN-HSA-NEXT: s_endpgm ; @@ -1386,13 +1384,12 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s6, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -1577,14 +1574,13 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s2, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -1595,14 +1591,13 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx2 v[4:5], v[0:1] -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s2, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s2, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -1617,14 +1612,13 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s6, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -1828,7 +1822,6 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) @@ -1836,10 +1829,10 @@ ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s2, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s2, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s2, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s2, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -1847,7 +1840,6 @@ ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1861,12 +1853,12 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s4, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s4, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: s_endpgm @@ -1882,18 +1874,17 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s6, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s6, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s6, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -2132,7 +2123,6 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 @@ -2144,16 +2134,16 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s6, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s6, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s6, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s6, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s6, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s6, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s6, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s6, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -2179,7 +2169,6 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 @@ -2190,20 +2179,20 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s4, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s4, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[7:10] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s4, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s4, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s4, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] @@ -2223,27 +2212,26 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s6, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s6, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s6, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s6, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s6, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s6, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s6, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 @@ -2612,7 +2600,6 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -2630,34 +2617,34 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, s0, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v0 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, s0, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, s0, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, s0, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v8 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s0, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, s0, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xffff, v12 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 @@ -2673,7 +2660,6 @@ ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s14, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2710,8 +2696,8 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] @@ -2719,16 +2705,16 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xffff, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] @@ -2736,32 +2722,32 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v12 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v14 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s14, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s14, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v13, s14, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s14, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] @@ -2782,45 +2768,42 @@ ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s0, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v2 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s0, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s0, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s0, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v4 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, s0, v11 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v11 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v10 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v10 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s0, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v9 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v8 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s0, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v14 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s0, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v12 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 @@ -3483,7 +3466,6 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v15 @@ -3493,16 +3475,16 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, s0, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xffff, v13 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, s0, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill @@ -3510,57 +3492,57 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v19 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v19 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v18 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v18 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v9, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s0, v17 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s0, v16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v16 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v26 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v24 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v27 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v26 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v25 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, s0, v24 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v27 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v26 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v31 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, s0, v30 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v31 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v30 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, s0, v29 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, s0, v28 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v33, 0xffff, v29 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v28 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v46, 16, v38 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v35 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, s0, v38 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, s0, v37 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, s0, v36 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, s0, v35 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xffff, v38 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v43, 0xffff, v37 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xffff, v36 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v47, 0xffff, v35 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v38, 16, v42 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v41 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v54, 16, v40 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v39 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, s0, v42 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, s0, v41 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v37, 0xffff, v42 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v35, 0xffff, v41 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[55:58], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, s0, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, s0, v39 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xffff, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v51, 0xffff, v39 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[39:42], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v58 @@ -3568,19 +3550,19 @@ ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v56 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v55 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, s0, v58 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, s0, v57 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s0, v56 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v55 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xffff, v58 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v57 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v56 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v55 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v41 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v40 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v39 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v42 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v41 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v40 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v39 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xffff, v42 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v41 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v40 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 @@ -3621,7 +3603,6 @@ ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s14, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -3629,13 +3610,13 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -3674,118 +3655,117 @@ ; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s14, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s14, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xb0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s14, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s14, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v24, 0xffff, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v35 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v33 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v32 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v35 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v34 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v35 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v34 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v35 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v34 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v28 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v28 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v29 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v30 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v31 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v31 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v30 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v22 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v12 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v14, s14, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s14, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 @@ -3796,8 +3776,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s14, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s14, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v18 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 @@ -3823,99 +3803,98 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:96 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[4:7], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[4:7], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(7) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, s4, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, s4, v6 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 16, v15 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v62, 0xffff, v15 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, 0xffff, v14 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s4, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s4, v4 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s4, v23 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s4, v22 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s4, v21 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s4, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v27 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, s4, v27 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v26 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s4, v26 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, s4, v25 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s4, v24 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s4, v31 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v30 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, s4, v30 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v30 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v30 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s4, v29 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, 0xffff, v29 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s4, v28 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v35 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, s4, v35 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, s4, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v35 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v35 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xffff, v34 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, s4, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v33 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s4, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, s4, v19 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s4, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v39 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, 0xffff, v39 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xffff, v38 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, 0xffff, v27 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v26 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v25 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v23 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xffff, v22 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v20 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v19 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, 0xffff, v19 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xffff, v18 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s4, v17 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v17 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s4, v16 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s4, v15 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, s4, v14 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s4, v13 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s4, v12 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, s4, v11 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, s4, v10 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s4, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s4, v8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:192 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5750,7 +5729,6 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1 @@ -5759,8 +5737,8 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s2, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -5768,7 +5746,6 @@ ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5785,9 +5762,9 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s4, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s4, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: s_endpgm @@ -5803,7 +5780,6 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 @@ -5811,9 +5787,9 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v9 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -6054,15 +6030,14 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { ; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, 0xffff -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0 @@ -6072,30 +6047,30 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s12, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s12, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s12, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s12, v3 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v3 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v12 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6110,24 +6085,23 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s4, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v12 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s4, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s4, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] @@ -6145,7 +6119,6 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 @@ -6157,13 +6130,13 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v17 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s6, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s6, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s6, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 @@ -6513,27 +6486,26 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s0, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, s0, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v6 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, s0, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v20 @@ -6566,10 +6538,10 @@ ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6587,35 +6559,34 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v1 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v3 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[7:10] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 @@ -6629,11 +6600,11 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s6, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s6, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 @@ -6655,9 +6626,9 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v28 @@ -6669,27 +6640,26 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, s0, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, s0, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v4 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v28 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v28 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96 @@ -7289,7 +7259,6 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 @@ -7304,48 +7273,48 @@ ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v2 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, s0, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v4 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v3 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, s0, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v5 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v8 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, s0, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v7 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, s0, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v9 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, s0, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v10 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v12 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, s0, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v11 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, s0, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v13 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, s0, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v14 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v16 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v16 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v1 @@ -7427,7 +7396,6 @@ ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s16, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 @@ -7470,52 +7438,52 @@ ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v13 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v11 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v19 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v17 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v15 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_and_b32_e32 v0, s16, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[2:5] ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v18 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 @@ -7523,7 +7491,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 @@ -7532,7 +7500,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xa0 @@ -7541,7 +7509,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[2:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s4 @@ -7554,12 +7522,12 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_and_b32_e32 v13, s16, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[13:16] ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 @@ -7567,7 +7535,7 @@ ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 @@ -7590,39 +7558,27 @@ ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, 0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, s0, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v33 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s0, v36 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v36 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s0, v38 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v38 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v37 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, s0, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, s0, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, s0, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, s0, v8 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, s0, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s0, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s0, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, s0, v33 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, s0, v35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v37 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v35 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v37 @@ -7634,7 +7590,9 @@ ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v7 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v31 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v37 @@ -7642,9 +7600,15 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v37 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xffff, v1 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v8 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v32 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v34 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v37 @@ -7655,10 +7619,13 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v37 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xffff, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v6 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, 0xffff, v5 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144 @@ -7667,6 +7634,7 @@ ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v37 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -502,13 +502,12 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 -; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; CI-NEXT: v_and_b32_e32 v3, s0, v3 +; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5 @@ -593,13 +592,12 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s0, 0xff00ff ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CI-NEXT: v_and_b32_e32 v3, s0, v3 -; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_and_b32_e32 v3, 0xff00ff, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll --- a/llvm/test/CodeGen/AMDGPU/madmk.ll +++ b/llvm/test/CodeGen/AMDGPU/madmk.ll @@ -31,9 +31,8 @@ ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000 -; GCN-DAG: v_mac_f32_e32 [[VB]], [[SK]], [[VA]] -; GCN-DAG: v_mac_f32_e32 [[VC]], [[SK]], [[VA]] +; GCN-DAG: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]] +; GCN-DAG: v_mac_f32_e32 [[VC]], 0x41200000, [[VA]] ; GCN: s_endpgm define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/mul.i16.ll b/llvm/test/CodeGen/AMDGPU/mul.i16.ll --- a/llvm/test/CodeGen/AMDGPU/mul.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.i16.ll @@ -3,9 +3,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; GCN-LABEL: {{^}}v_mul_i16: -; SI: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} -; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]] -; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]] +; SI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} ; SI: v_mul_u32_u24 ; GFX89: v_mul_lo_u16_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -838,13 +838,12 @@ ; GCN-LABEL: test_umul24_anyextend_i23_src0_src1: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, 0x7fffff -; GCN-NEXT: v_and_b32_e32 v0, s4, v0 -; GCN-NEXT: v_and_b32_e32 v1, s4, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 0xea, v0 ; GCN-NEXT: v_mul_u32_u24_e32 v1, 0x39b, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0x7ffffe, v0 -; GCN-NEXT: v_and_b32_e32 v1, s4, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 ; GCN-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0x1fffe, v0 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x63, v0 diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll --- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll @@ -89,11 +89,10 @@ ret void } ; CHECK-LABEL: {{^}}vector_imm: -; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64 -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, 0x64, v{{[0-9]}} define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -50,8 +50,8 @@ } ; GCN-LABEL: {{^}}fadd_v2_v_imm: -; GCN: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 -; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}} +; GFX90A: s_mov_b32 s[[K:[0-9]+]], +; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}} ; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fadd_v2_v_imm(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -261,8 +261,8 @@ } ; GCN-LABEL: {{^}}fmul_v2_v_imm: -; GCN: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 -; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}} +; GFX90A: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 +; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}} ; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} define amdgpu_kernel void @fmul_v2_v_imm(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -552,8 +552,7 @@ } ; GCN-LABEL: {{^}}fneg_v2f32_vec: -; GFX900: s_brev_b32 [[SIGN:s[0-9]+]], 1 -; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, [[SIGN]], v{{[0-9]+}} +; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}} define amdgpu_kernel void @fneg_v2f32_vec(<2 x float> addrspace(1)* %a) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll --- a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -4,7 +4,7 @@ ; SI-LABEL: {{^}}s_movk_i32_k0: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -18,7 +18,7 @@ ; SI-LABEL: {{^}}s_movk_i32_k1: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x7fff, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -32,7 +32,7 @@ ; SI-LABEL: {{^}}s_movk_i32_k2: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x7fff, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -46,7 +46,7 @@ ; SI-LABEL: {{^}}s_movk_i32_k3: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x8000, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -60,7 +60,7 @@ ; SI-LABEL: {{^}}s_movk_i32_k4: ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x20000, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -75,8 +75,8 @@ ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffffef, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xff00ffff, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -89,7 +89,7 @@ ; SI-LABEL: {{^}}s_movk_i32_k6: ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x41, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { @@ -104,8 +104,8 @@ ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}} ; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x2000, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x4000, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -119,8 +119,8 @@ ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8000, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -134,8 +134,8 @@ ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8001, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -149,8 +149,8 @@ ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8888, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -164,8 +164,8 @@ ; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8fff, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 @@ -179,8 +179,8 @@ ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}} ; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} ; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff7001, v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]] ; SI: s_endpgm define amdgpu_kernel void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -134,13 +134,11 @@ ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -156,15 +154,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -198,18 +195,16 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_i32_e32 v3, s5, v2 +; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -224,22 +219,21 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 @@ -273,29 +267,26 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v2, s5, v2 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -308,15 +299,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -327,14 +317,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 ; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -368,15 +358,14 @@ ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3 ; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -388,15 +377,14 @@ ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: s_brev_b32 s6, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3 ; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -401,7 +401,6 @@ ; GCN-NEXT: s_mov_b32 s8, s2 ; GCN-NEXT: s_mov_b32 s9, s3 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -421,8 +420,8 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 0, v3 -; GCN-NEXT: v_mul_f32_e32 v5, s2, v5 -; GCN-NEXT: v_mul_f32_e32 v7, s2, v7 +; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; GCN-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 @@ -475,7 +474,6 @@ ; TONGA-NEXT: s_mov_b32 s8, s2 ; TONGA-NEXT: s_mov_b32 s9, s3 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s2, 0x4f7ffffe ; TONGA-NEXT: s_mov_b32 s4, s0 ; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) @@ -495,8 +493,8 @@ ; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v3 -; TONGA-NEXT: v_mul_f32_e32 v5, s2, v5 -; TONGA-NEXT: v_mul_f32_e32 v7, s2, v7 +; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; TONGA-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 ; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v7 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 @@ -549,7 +547,6 @@ ; GFX9-NEXT: s_mov_b32 s8, s2 ; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -567,8 +564,8 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v1 -; GFX9-NEXT: v_mul_f32_e32 v6, s2, v6 -; GFX9-NEXT: v_mul_f32_e32 v7, s2, v7 +; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v8 @@ -801,130 +798,129 @@ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: s_mov_b32 s6, s10 -; GCN-NEXT: s_mov_b32 s7, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s4, s2 -; GCN-NEXT: s_mov_b32 s5, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GCN-NEXT: s_mov_b32 s8, s0 ; GCN-NEXT: s_mov_b32 s9, s1 +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: s_mov_b32 s1, s3 +; GCN-NEXT: s_mov_b32 s2, s10 +; GCN-NEXT: s_mov_b32 s3, s11 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 -; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 -; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 -; GCN-NEXT: v_cvt_f32_u32_e32 v9, v5 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 +; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 ; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 +; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 ; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GCN-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GCN-NEXT: v_mul_lo_u32 v9, v9, v8 +; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 ; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 -; GCN-NEXT: v_mul_f32_e32 v9, s2, v9 ; GCN-NEXT: v_xor_b32_e32 v16, v10, v11 +; GCN-NEXT: v_mul_hi_u32 v9, v8, v9 +; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 +; GCN-NEXT: v_cvt_f32_u32_e32 v10, v5 ; GCN-NEXT: v_cvt_f32_u32_e32 v11, v6 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 ; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 +; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v10 +; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v11 +; GCN-NEXT: v_mul_lo_u32 v11, v8, v4 +; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 ; GCN-NEXT: v_xor_b32_e32 v17, v12, v13 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 -; GCN-NEXT: v_mul_f32_e32 v8, s2, v8 ; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v9 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v8 -; GCN-NEXT: v_mul_hi_u32 v12, v9, v12 -; GCN-NEXT: v_mul_f32_e32 v11, s2, v11 -; GCN-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GCN-NEXT: v_mul_hi_u32 v10, v8, v10 -; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 -; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v6 -; GCN-NEXT: v_mul_lo_u32 v12, v12, v11 -; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 -; GCN-NEXT: v_mul_hi_u32 v12, v11, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 ; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; GCN-NEXT: v_xor_b32_e32 v7, v7, v14 -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GCN-NEXT: v_mul_lo_u32 v12, v8, v4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v9 -; GCN-NEXT: v_mul_hi_u32 v11, v2, v11 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] -; GCN-NEXT: v_mul_f32_e32 v10, s2, v10 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_mul_lo_u32 v0, v9, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v10 -; GCN-NEXT: v_mul_lo_u32 v10, v11, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v8 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v9 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 +; GCN-NEXT: v_mul_lo_u32 v0, v0, v10 +; GCN-NEXT: v_xor_b32_e32 v4, v7, v14 +; GCN-NEXT: v_mul_hi_u32 v7, v9, v12 +; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4 +; GCN-NEXT: v_mul_hi_u32 v0, v10, v0 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v7 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GCN-NEXT: v_mul_hi_u32 v0, v2, v0 +; GCN-NEXT: v_mul_lo_u32 v10, v7, v5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v4 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 +; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 +; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] -; GCN-NEXT: v_sub_i32_e32 v9, vcc, v0, v5 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v7 +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[2:3] +; GCN-NEXT: v_add_i32_e32 v10, vcc, 1, v0 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; GCN-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5] -; GCN-NEXT: v_sub_i32_e32 v11, vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[2:3] -; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v1 +; GCN-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[4:5] +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v5 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[2:3] +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v2, v6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] +; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v7 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v8, v15 ; GCN-NEXT: v_xor_b32_e32 v5, v0, v16 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v15 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v16 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v7 -; GCN-NEXT: v_mul_lo_u32 v5, v5, v4 -; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GCN-NEXT: v_mul_hi_u32 v5, v4, v5 -; GCN-NEXT: v_xor_b32_e32 v3, v3, v9 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[4:5] -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v10 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v9, v12 +; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GCN-NEXT: v_mul_hi_u32 v5, v12, v5 +; GCN-NEXT: v_xor_b32_e32 v3, v3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v10 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GCN-NEXT: v_mul_hi_u32 v5, v3, v5 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc ; GCN-NEXT: v_xor_b32_e32 v2, v2, v17 -; GCN-NEXT: v_mul_lo_u32 v5, v4, v7 +; GCN-NEXT: v_mul_lo_u32 v6, v5, v4 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 -; GCN-NEXT: v_xor_b32_e32 v6, v9, v14 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v7 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v7 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v3, v3, v6 +; GCN-NEXT: v_xor_b32_e32 v7, v8, v14 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v4 +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] +; GCN-NEXT: v_sub_i32_e32 v6, vcc, v3, v4 +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v5 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; @@ -933,130 +929,129 @@ ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; TONGA-NEXT: s_mov_b32 s11, 0xf000 ; TONGA-NEXT: s_mov_b32 s10, -1 -; TONGA-NEXT: s_mov_b32 s6, s10 -; TONGA-NEXT: s_mov_b32 s7, s11 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s4, s2 -; TONGA-NEXT: s_mov_b32 s5, s3 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; TONGA-NEXT: s_mov_b32 s2, 0x4f7ffffe ; TONGA-NEXT: s_mov_b32 s8, s0 ; TONGA-NEXT: s_mov_b32 s9, s1 +; TONGA-NEXT: s_mov_b32 s0, s2 +; TONGA-NEXT: s_mov_b32 s1, s3 +; TONGA-NEXT: s_mov_b32 s2, s10 +; TONGA-NEXT: s_mov_b32 s3, s11 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; TONGA-NEXT: s_waitcnt vmcnt(1) ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 ; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 -; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 -; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 -; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 -; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v5 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v8, v0 +; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 +; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 ; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v4 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v9 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v4 +; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5 +; TONGA-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 +; TONGA-NEXT: v_mul_lo_u32 v9, v9, v8 +; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 ; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13 -; TONGA-NEXT: v_mul_f32_e32 v9, s2, v9 ; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11 +; TONGA-NEXT: v_mul_hi_u32 v9, v8, v9 +; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 +; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v5 ; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v6 -; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v9, v8 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 +; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v10 +; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v11 +; TONGA-NEXT: v_mul_lo_u32 v11, v8, v4 +; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 ; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2 ; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12 -; TONGA-NEXT: v_mul_f32_e32 v8, s2, v8 ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v5 -; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v11, v11 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v11 +; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 ; TONGA-NEXT: v_mul_lo_u32 v12, v12, v9 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 -; TONGA-NEXT: v_mul_lo_u32 v10, v10, v8 -; TONGA-NEXT: v_mul_hi_u32 v12, v9, v12 -; TONGA-NEXT: v_mul_f32_e32 v11, s2, v11 -; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11 -; TONGA-NEXT: v_mul_hi_u32 v10, v8, v10 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v12, v9 -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v6 -; TONGA-NEXT: v_mul_lo_u32 v12, v12, v11 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 -; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 -; TONGA-NEXT: v_mul_hi_u32 v12, v11, v12 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v0, v4 ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7 -; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14 -; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v7 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v12, v11 -; TONGA-NEXT: v_mul_lo_u32 v12, v8, v4 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9 -; TONGA-NEXT: v_mul_hi_u32 v11, v2, v11 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v12 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] -; TONGA-NEXT: v_mul_f32_e32 v10, s2, v10 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_mul_lo_u32 v0, v9, v5 -; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v10 -; TONGA-NEXT: v_mul_lo_u32 v10, v11, v6 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v8 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v9 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v6 +; TONGA-NEXT: v_mul_lo_u32 v0, v0, v10 +; TONGA-NEXT: v_xor_b32_e32 v4, v7, v14 +; TONGA-NEXT: v_mul_hi_u32 v7, v9, v12 +; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4 +; TONGA-NEXT: v_mul_hi_u32 v0, v10, v0 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v9 +; TONGA-NEXT: v_mul_hi_u32 v7, v1, v7 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v10 +; TONGA-NEXT: v_mul_hi_u32 v0, v2, v0 +; TONGA-NEXT: v_mul_lo_u32 v10, v7, v5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v4 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 +; TONGA-NEXT: v_mul_lo_u32 v10, v0, v6 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 +; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 ; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v10 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v11 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v0, v5 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v7 +; TONGA-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[2:3] +; TONGA-NEXT: v_add_u32_e32 v10, vcc, 1, v0 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5] -; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[2:3] -; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v1 +; TONGA-NEXT: v_cndmask_b32_e64 v10, v0, v10, s[4:5] +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v5 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[2:3] +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v2, v6 +; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] +; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v7 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc +; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; TONGA-NEXT: v_xor_b32_e32 v1, v8, v15 ; TONGA-NEXT: v_xor_b32_e32 v5, v0, v16 ; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v15, v1 ; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v16, v5 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v7 -; TONGA-NEXT: v_mul_lo_u32 v5, v5, v4 -; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v9, v3 -; TONGA-NEXT: v_mul_hi_u32 v5, v4, v5 -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v9 -; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[4:5] -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v10 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4 -; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 +; TONGA-NEXT: v_mul_lo_u32 v5, v9, v12 +; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v3 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; TONGA-NEXT: v_mul_hi_u32 v5, v12, v5 +; TONGA-NEXT: v_xor_b32_e32 v3, v3, v8 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v10 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v12 +; TONGA-NEXT: v_mul_hi_u32 v5, v3, v5 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17 -; TONGA-NEXT: v_mul_lo_u32 v5, v4, v7 +; TONGA-NEXT: v_mul_lo_u32 v6, v5, v4 ; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v17, v2 -; TONGA-NEXT: v_xor_b32_e32 v6, v9, v14 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v7 -; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v3, v7 -; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 -; TONGA-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6 -; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; TONGA-NEXT: v_xor_b32_e32 v7, v8, v14 +; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5 +; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v3, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v5 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; TONGA-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 +; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v7, v3 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; @@ -1072,7 +1067,6 @@ ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX9-NEXT: s_mov_b32 s8, s0 ; GFX9-NEXT: s_mov_b32 s9, s1 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -1113,13 +1107,13 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; GFX9-NEXT: v_mul_f32_e32 v8, s2, v8 +; GFX9-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX9-NEXT: v_mul_f32_e32 v10, s2, v10 -; GFX9-NEXT: v_mul_f32_e32 v12, s2, v12 +; GFX9-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; GFX9-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GFX9-NEXT: v_sub_u32_e32 v9, 0, v4 -; GFX9-NEXT: v_mul_f32_e32 v14, s2, v14 +; GFX9-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 ; GFX9-NEXT: v_mul_lo_u32 v9, v9, v8 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1823,7 +1823,7 @@ ; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mul_f32_e32 v1, s3, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v2, -v1, v0, s3 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ -1850,7 +1850,7 @@ ; GCN-IR-NEXT: s_or_b32 s0, s0, 1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s3, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s3 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ -1935,7 +1935,7 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -1955,7 +1955,7 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -1980,7 +1980,7 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v3, -v2, v1, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -2000,7 +2000,7 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -507,9 +507,9 @@ ; Check that "pulling out" SDWA operands works correctly. ; GCN-LABEL: {{^}}pulled_out_test: -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_and_b32_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -647,7 +647,6 @@ ; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 -; SI-NEXT: s_mov_b32 s0, 0xffff ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 @@ -659,8 +658,8 @@ ; SI-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, v9, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, v8, v6 -; SI-NEXT: v_and_b32_e32 v3, s0, v3 -; SI-NEXT: v_and_b32_e32 v2, s0, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -507,7 +507,6 @@ ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 -; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 @@ -519,8 +518,8 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, v4, v2 ; CI-NEXT: v_lshlrev_b32_e32 v4, v9, v7 ; CI-NEXT: v_lshlrev_b32_e32 v5, v8, v6 -; CI-NEXT: v_and_b32_e32 v3, s0, v3 -; CI-NEXT: v_and_b32_e32 v2, s0, v2 +; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 @@ -575,7 +574,6 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s2, 0xff000000 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -584,8 +582,8 @@ ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_and_b32_e32 v4, s2, v4 -; VI-NEXT: v_and_b32_e32 v0, s2, v0 +; VI-NEXT: v_and_b32_e32 v4, 0xff000000, v4 +; VI-NEXT: v_and_b32_e32 v0, 0xff000000, v0 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -601,14 +599,13 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s0, 0xff00 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; CI-NEXT: v_and_b32_e32 v4, s0, v4 +; CI-NEXT: v_and_b32_e32 v4, 0xff00, v4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; CI-NEXT: v_and_b32_e32 v3, s0, v3 +; CI-NEXT: v_and_b32_e32 v3, 0xff00, v3 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -243,7 +243,6 @@ ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s6, 0xffff ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -258,9 +257,9 @@ ; SI-NEXT: v_ashrrev_i32_e32 v0, v6, v0 ; SI-NEXT: v_ashrrev_i32_e32 v2, v2, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, s6, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v2, s6, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll @@ -64,8 +64,7 @@ ; CHECK-NEXT: v_bfe_i32 v5, v0, 0, 31 ; CHECK-NEXT: s_mov_b32 s4, 0x38e38e39 ; CHECK-NEXT: s_mov_b32 s5, 0xc71c71c7 -; CHECK-NEXT: s_brev_b32 s6, -2 -; CHECK-NEXT: s_mov_b32 s7, 0x7ffffffd +; CHECK-NEXT: s_mov_b32 s6, 0x7ffffffd ; CHECK-NEXT: v_mul_hi_i32 v5, v5, s4 ; CHECK-NEXT: v_mul_hi_i32 v4, v4, s4 ; CHECK-NEXT: v_mul_hi_i32 v3, v3, s5 @@ -84,12 +83,12 @@ ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_and_b32_e32 v2, s6, v2 -; CHECK-NEXT: v_and_b32_e32 v1, s6, v1 -; CHECK-NEXT: v_and_b32_e32 v0, s6, v0 +; CHECK-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s6, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1993,7 +1993,7 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mul_f32_e32 v1, s6, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mad_f32 v2, -v1, v0, s6 ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ -2020,7 +2020,7 @@ ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s6, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s6 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1 @@ -2113,7 +2113,7 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v0 ; GCN-NEXT: v_or_b32_e32 v3, 1, v3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v4, -v2, v1, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -2135,7 +2135,7 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 30, v0 ; GCN-IR-NEXT: v_or_b32_e32 v3, 1, v3 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v4, -v2, v1, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -2162,7 +2162,7 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v0 ; GCN-NEXT: v_or_b32_e32 v3, 1, v3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v4, -v2, v1, s4 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 @@ -2184,7 +2184,7 @@ ; GCN-IR-NEXT: v_ashrrev_i32_e32 v3, 30, v0 ; GCN-IR-NEXT: v_or_b32_e32 v3, 1, v3 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v4, -v2, v1, s4 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -134,13 +134,11 @@ ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -156,15 +154,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -198,21 +195,18 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_max_i32_e32 v3, s5, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -225,22 +219,21 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 @@ -274,29 +267,26 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: s_movk_i32 s4, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: s_movk_i32 s5, 0x8000 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v2, s5, v2 +; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 +; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 +; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -309,15 +299,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -328,14 +317,14 @@ ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 ; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 ; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -369,15 +358,14 @@ ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -389,15 +377,14 @@ ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: s_brev_b32 s6, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -428,22 +415,21 @@ ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -455,22 +441,21 @@ ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: s_brev_b32 s6, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -503,29 +488,28 @@ ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -537,29 +521,28 @@ ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX8-NEXT: s_brev_b32 s6, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -594,57 +577,56 @@ ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v5, s6, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -656,57 +638,56 @@ ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX8-NEXT: s_brev_b32 s6, 1 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v5, s6, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -748,114 +729,113 @@ ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 -; GFX6-NEXT: s_brev_b32 s6, 1 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v5, s6, v5 +; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v6, v22 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v7, v23 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v8, v24 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v8, s6, v8 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v9, v25 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 ; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v9, s6, v9 +; GFX6-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v26 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 ; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v10, s6, v10 +; GFX6-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v11, v27 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 ; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v11, s6, v11 +; GFX6-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v12, v28 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 ; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v12, s6, v12 +; GFX6-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v13, v29 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 ; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v13, s6, v13 +; GFX6-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v14, v30 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 ; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v14, s6, v14 +; GFX6-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc ; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v31 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 ; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v15, s6, v15 +; GFX6-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -866,114 +846,113 @@ ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 -; GFX8-NEXT: s_brev_b32 s6, 1 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 ; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 ; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v5, v21 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v5 ; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v5, s6, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v6, v22 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v6 ; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v7, v23 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v7 ; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v8, v24 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v8 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v8, s6, v8 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v9, v25 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v9 ; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v9, s6, v9 +; GFX8-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v10, v26 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v10 ; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v10, s6, v10 +; GFX8-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v11, v27 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v11 ; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v11, s6, v11 +; GFX8-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v12, v28 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v12 ; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v12, s6, v12 +; GFX8-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v13, v29 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v13 ; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v13, s6, v13 +; GFX8-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v14, v30 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v14 ; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v14, s6, v14 +; GFX8-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc ; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v31 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v31 ; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 ; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v15, s6, v15 +; GFX8-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 ; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -96,10 +96,9 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 ; GFX9-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX9-NEXT: v_fma_f16 v0, v0, v2, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_fma_f16 v7, v9, v8, v7 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -8,11 +8,10 @@ ; GFX6-LABEL: v_uaddsat_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 0xff, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i8: @@ -46,11 +45,10 @@ ; GFX6-LABEL: v_uaddsat_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_i16: @@ -110,15 +108,14 @@ ; GFX6-LABEL: v_uaddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -152,20 +149,19 @@ ; GFX6-LABEL: v_uaddsat_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_min_u32_e32 v3, s4, v2 +; GFX6-NEXT: v_min_u32_e32 v3, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -202,25 +198,24 @@ ; GFX6-LABEL: v_uaddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v6 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v7 -; GFX6-NEXT: v_min_u32_e32 v2, s4, v2 -; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_u32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -343,7 +343,6 @@ ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s2, 0x4f7ffffe ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -353,8 +352,8 @@ ; SI-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; SI-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; SI-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 -; SI-NEXT: v_mul_f32_e32 v4, s2, v4 -; SI-NEXT: v_mul_f32_e32 v5, s2, v5 +; SI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; SI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 ; SI-NEXT: v_cvt_u32_f32_e32 v4, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v5, v5 ; SI-NEXT: v_mul_lo_u32 v6, v6, v4 @@ -399,7 +398,6 @@ ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s2, 0x4f7ffffe ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -409,8 +407,8 @@ ; VI-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; VI-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; VI-NEXT: v_sub_u32_e32 v7, vcc, 0, v3 -; VI-NEXT: v_mul_f32_e32 v4, s2, v4 -; VI-NEXT: v_mul_f32_e32 v5, s2, v5 +; VI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 +; VI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 ; VI-NEXT: v_cvt_u32_f32_e32 v4, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v5 ; VI-NEXT: v_mul_lo_u32 v6, v6, v4 @@ -451,15 +449,14 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GCN-NEXT: v_mul_f32_e32 v4, s2, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GCN-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GCN-NEXT: v_mul_f32_e32 v5, s2, v5 +; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v5 ; GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 ; GCN-NEXT: v_mul_lo_u32 v5, v4, v6 @@ -613,7 +610,6 @@ ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: s_mov_b32 s2, 0x4f7ffffe ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -625,10 +621,10 @@ ; SI-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; SI-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; SI-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; SI-NEXT: v_mul_f32_e32 v8, s2, v8 -; SI-NEXT: v_mul_f32_e32 v10, s2, v10 -; SI-NEXT: v_mul_f32_e32 v12, s2, v12 -; SI-NEXT: v_mul_f32_e32 v14, s2, v14 +; SI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; SI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; SI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; SI-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 ; SI-NEXT: v_cvt_u32_f32_e32 v8, v8 ; SI-NEXT: v_cvt_u32_f32_e32 v10, v10 ; SI-NEXT: v_cvt_u32_f32_e32 v12, v12 @@ -708,7 +704,6 @@ ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; VI-NEXT: s_mov_b32 s2, 0x4f7ffffe ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -720,10 +715,10 @@ ; VI-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; VI-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; VI-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; VI-NEXT: v_mul_f32_e32 v8, s2, v8 -; VI-NEXT: v_mul_f32_e32 v10, s2, v10 -; VI-NEXT: v_mul_f32_e32 v12, s2, v12 -; VI-NEXT: v_mul_f32_e32 v14, s2, v14 +; VI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; VI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; VI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; VI-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 ; VI-NEXT: v_cvt_u32_f32_e32 v8, v8 ; VI-NEXT: v_cvt_u32_f32_e32 v10, v10 ; VI-NEXT: v_cvt_u32_f32_e32 v12, v12 @@ -803,7 +798,6 @@ ; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: v_mov_b32_e32 v9, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -815,10 +809,10 @@ ; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 ; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 ; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 -; GCN-NEXT: v_mul_f32_e32 v10, s2, v10 -; GCN-NEXT: v_mul_f32_e32 v12, s2, v12 -; GCN-NEXT: v_mul_f32_e32 v14, s2, v14 -; GCN-NEXT: v_mul_f32_e32 v16, s2, v16 +; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 +; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 +; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 ; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1725,7 +1725,7 @@ ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s4 @@ -1746,7 +1746,7 @@ ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s4 @@ -1820,7 +1820,7 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s4 @@ -1837,7 +1837,7 @@ ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s4 @@ -1859,7 +1859,7 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x47000000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s4 @@ -1876,7 +1876,7 @@ ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x47000000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -156,19 +156,18 @@ ; GFX6-LABEL: test_udivrem_v2: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX6-NEXT: s_sub_i32 s2, 0, s7 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 @@ -176,22 +175,22 @@ ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s7, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -201,18 +200,17 @@ ; GFX8-LABEL: test_udivrem_v2: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX8-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX8-NEXT: s_sub_i32 s2, 0, s6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: s_sub_i32 s2, 0, s6 ; GFX8-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX8-NEXT: s_sub_i32 s2, 0, s7 ; GFX8-NEXT: v_mul_lo_u32 v3, s2, v1 @@ -230,11 +228,11 @@ ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s7, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s7, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -319,26 +317,25 @@ ; GFX6-LABEL: test_udivrem_v4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_sub_i32 s2, 0, s8 -; GFX6-NEXT: s_sub_i32 s12, 0, s9 +; GFX6-NEXT: s_sub_i32 s12, 0, s8 +; GFX6-NEXT: s_sub_i32 s13, 0, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 -; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s13, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 @@ -346,7 +343,7 @@ ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -366,7 +363,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: s_sub_i32 s4, 0, s11 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, s13, v4 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 @@ -398,7 +395,6 @@ ; GFX8-LABEL: test_udivrem_v4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s12, 0x4f7ffffe ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8 @@ -409,9 +405,9 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s11 -; GFX8-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, s12, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_mul_lo_u32 v2, s2, v0 @@ -424,18 +420,18 @@ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v4, v1 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX8-NEXT: v_mul_f32_e32 v2, s12, v3 +; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v0 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, s2, v2 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s9, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 @@ -443,7 +439,7 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX8-NEXT: s_sub_i32 s2, 0, s11 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, s12, v4 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s9, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s6, v2 diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll @@ -5,12 +5,11 @@ ; CHECK-LABEL: test_urem_odd: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1fff -; CHECK-NEXT: s_movk_i32 s5, 0x667 -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x1fff, v0 +; CHECK-NEXT: s_movk_i32 s4, 0x667 ; CHECK-NEXT: v_mul_u32_u24_e32 v0, 0xccd, v0 -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s5, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x1fff, v0 +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %urem = urem i13 %X, 5 @@ -56,10 +55,9 @@ ; CHECK-LABEL: test_urem_negative_odd: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x1ff -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x1ff, v0 ; CHECK-NEXT: v_mul_u32_u24_e32 v0, 0x133, v0 -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 0x1ff, v0 ; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -72,28 +70,27 @@ ; CHECK-LABEL: test_urem_vec: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x7ff -; CHECK-NEXT: s_mov_b32 s5, 0x8311eb33 -; CHECK-NEXT: s_mov_b32 s6, 0x20140c -; CHECK-NEXT: s_mov_b32 s7, 0xb6db6db7 -; CHECK-NEXT: s_mov_b32 s8, 0x49249249 -; CHECK-NEXT: s_mov_b32 s9, 0x24924924 -; CHECK-NEXT: s_mov_b32 s10, 0xaaaaaaab -; CHECK-NEXT: s_mov_b32 s11, 0x2aaaaaaa -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 -; CHECK-NEXT: v_and_b32_e32 v1, s4, v1 -; CHECK-NEXT: v_and_b32_e32 v2, s4, v2 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, s5 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, s7 -; CHECK-NEXT: v_mul_lo_u32 v0, v0, s10 +; CHECK-NEXT: v_and_b32_e32 v0, 0x7ff, v0 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7ff, v1 +; CHECK-NEXT: v_and_b32_e32 v2, 0x7ff, v2 +; CHECK-NEXT: s_mov_b32 s4, 0x8311eb33 +; CHECK-NEXT: s_mov_b32 s5, 0x20140c +; CHECK-NEXT: s_mov_b32 s6, 0xb6db6db7 +; CHECK-NEXT: s_mov_b32 s7, 0x49249249 +; CHECK-NEXT: s_mov_b32 s8, 0x24924924 +; CHECK-NEXT: s_mov_b32 s9, 0xaaaaaaab +; CHECK-NEXT: s_mov_b32 s10, 0x2aaaaaaa +; CHECK-NEXT: v_mul_lo_u32 v2, v2, s4 +; CHECK-NEXT: v_mul_lo_u32 v1, v1, s6 +; CHECK-NEXT: v_mul_lo_u32 v0, v0, s9 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xf9dc299a, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, s8, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, s7, v1 ; CHECK-NEXT: v_alignbit_b32 v0, v0, v0, 1 -; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s11, v0 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s9, v1 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s8, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s6, v2 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %urem = urem <3 x i11> %X, diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -488,84 +488,82 @@ define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem31_v2i64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-NEXT: s_lshr_b32 s8, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: s_lshr_b32 s3, s3, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GCN-NEXT: s_lshr_b32 s2, s11, 1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GCN-NEXT: s_lshr_b32 s1, s3, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: v_mul_f32_e32 v2, v3, v2 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-NEXT: v_mad_f32 v4, -v2, v0, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_brev_b32 s0, -2 -; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_f32_e32 v2, v5, v6 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v3, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, s2 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem31_v2i64: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-IR-NEXT: s_lshr_b32 s3, s3, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GCN-IR-NEXT: s_lshr_b32 s2, s11, 1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GCN-IR-NEXT: s_lshr_b32 s1, s3, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v2 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-IR-NEXT: v_mad_f32 v4, -v2, v0, v1 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: s_brev_b32 s0, -2 -; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mul_f32_e32 v2, v5, v6 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GCN-IR-NEXT: v_mad_f32 v2, -v2, v3, v5 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm @@ -638,84 +636,82 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-NEXT: s_lshr_b32 s8, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: s_lshr_b32 s3, s3, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GCN-NEXT: s_lshr_b32 s2, s11, 9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_lshr_b32 s1, s11, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GCN-NEXT: s_lshr_b32 s1, s3, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: v_mul_f32_e32 v2, v3, v2 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-NEXT: v_mad_f32 v4, -v2, v0, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_brev_b32 s0, -2 -; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_f32_e32 v2, v5, v6 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GCN-NEXT: v_mad_f32 v2, -v2, v3, v5 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, s2 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem23_64_v2i64: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 +; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 +; GCN-IR-NEXT: s_lshr_b32 s8, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-IR-NEXT: s_lshr_b32 s3, s3, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GCN-IR-NEXT: s_lshr_b32 s2, s11, 9 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GCN-IR-NEXT: s_lshr_b32 s1, s3, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v2 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s3 +; GCN-IR-NEXT: v_mad_f32 v4, -v2, v0, v1 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: s_brev_b32 s0, -2 -; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GCN-IR-NEXT: v_mul_f32_e32 v2, v5, v6 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GCN-IR-NEXT: v_mad_f32 v2, -v2, v3, v5 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 ; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm @@ -1401,7 +1397,7 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s5, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-NEXT: v_mad_f32 v1, -v1, v0, s5 @@ -1424,7 +1420,7 @@ ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v1, s5, v1 +; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mad_f32 v1, -v1, v0, s5 @@ -1506,7 +1502,7 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v2, -v2, v1, s4 @@ -1525,7 +1521,7 @@ ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v2, -v2, v1, s4 @@ -1549,7 +1545,7 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-NEXT: v_mad_f32 v2, -v2, v1, s4 @@ -1568,7 +1564,7 @@ ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, s4, v2 +; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mad_f32 v2, -v2, v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -8,9 +8,8 @@ ; GFX6-LABEL: v_usubsat_i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -43,9 +42,8 @@ ; GFX6-LABEL: v_usubsat_i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -214,11 +212,10 @@ ; GFX6-LABEL: v_usubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v4, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v4 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 @@ -256,16 +253,15 @@ ; GFX6-LABEL: v_usubsat_v3i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v6, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v6 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -305,18 +301,17 @@ ; GFX6-LABEL: v_usubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v9, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v9 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v7 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -954,13 +954,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, v2, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v2, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_1100: @@ -1367,7 +1366,7 @@ ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1