Index: llvm/include/llvm/CodeGen/MachineInstr.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineInstr.h
+++ llvm/include/llvm/CodeGen/MachineInstr.h
@@ -1745,6 +1745,9 @@
   /// Return true if all the defs of this instruction are dead.
   bool allDefsAreDead() const;
 
+  /// Return true if all the implicit defs of this instruction are dead.
+  bool allImplicitDefsAreDead() const;
+
   /// Return a valid size if the instruction is a spill instruction.
   std::optional getSpillSize(const TargetInstrInfo *TII) const;
Index: llvm/lib/CodeGen/MachineInstr.cpp
===================================================================
--- llvm/lib/CodeGen/MachineInstr.cpp
+++ llvm/lib/CodeGen/MachineInstr.cpp
@@ -1497,6 +1497,16 @@
   return true;
 }
 
+bool MachineInstr::allImplicitDefsAreDead() const {
+  for (const MachineOperand &MO : implicit_operands()) {
+    if (!MO.isReg() || MO.isUse())
+      continue;
+    if (!MO.isDead())
+      return false;
+  }
+  return true;
+}
+
 /// copyImplicitOps - Copy implicit register operands from specified
 /// instruction to this instruction.
 void MachineInstr::copyImplicitOps(MachineFunction &MF,
Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1035,6 +1035,9 @@
 // selection.
 // TODO: See if a frame index with a fixed offset can fold.
 bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
+  if (!MI->allImplicitDefsAreDead())
+    return false;
+
   unsigned Opc = MI->getOpcode();
 
   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -41,10 +41,11 @@
 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f
 ; GFX8-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s3, 0xffff, 1
 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshr_b32 s1, s1, s3
 ; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7
 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
@@ -72,10 +73,11 @@
 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f
 ; GFX9-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_and_b32 s3, 0xffff, 1
 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT: s_lshr_b32 s1, s1, s3
 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7
 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
@@ -104,7 +106,6 @@
 ; GFX10-NEXT: s_and_b32 s1, s1, 0x7f
 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT: v_mul_lo_u32 v1, v0, -7
@@ -113,6 +114,8 @@
 ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0
 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 7
 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT: s_and_b32 s2, 0xffff, 1
+; GFX10-NEXT: s_lshr_b32 s1, s1, s2
 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0
 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0
 ; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -136,7 +139,6 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -148,25 +150,27 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11-NEXT: s_and_b32 s2, 0xffff, 1 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u16 v1, 6, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -349,24 +353,26 @@ ; ; GFX8-LABEL: s_fshl_i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s3, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_and_b32 s3, s2, 7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, 1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s3, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_and_b32 s3, s2, 7 +; GFX9-NEXT: s_and_b32 s3, 0xffff, 1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -374,10 +380,11 @@ ; GFX10-LABEL: s_fshl_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; 
GFX10-NEXT: s_and_b32 s3, s2, 7 +; GFX10-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s3, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s4 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -386,10 +393,11 @@ ; GFX11-LABEL: s_fshl_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_and_b32 s3, s2, 7 +; GFX11-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s3, s2, 7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshr_b32 s1, s1, s4 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -481,37 +489,41 @@ ; GFX8-LABEL: s_fshl_i8_4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, 4 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i8_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, 4 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_lshr_b32 s1, s1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i8_4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_and_b32 s2, 0xffff, 4 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 4 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i8_4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_and_b32 s2, 0xffff, 4 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s1, s1, 4 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4) @@ -577,38 +589,46 @@ ; ; GFX8-LABEL: s_fshl_i8_5: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s2, 0xffff, 5 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 5 -; GFX8-NEXT: s_lshr_b32 s1, s1, 3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 3 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i8_5: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s2, 0xffff, 5 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 5 -; GFX9-NEXT: s_lshr_b32 s1, s1, 3 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i8_5: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, 
s0, 5 +; GFX10-NEXT: s_and_b32 s2, 0xffff, 5 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 3 +; GFX10-NEXT: s_and_b32 s3, 0xffff, 3 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i8_5: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_and_b32 s2, 0xffff, 5 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s1, s1, 3 +; GFX11-NEXT: s_and_b32 s3, 0xffff, 3 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5) @@ -691,14 +711,15 @@ ; GFX8-LABEL: s_fshl_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 +; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, 1 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 -; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 @@ -706,26 +727,28 @@ ; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s3, s3, s6 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, 8 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 +; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s6, 0xffff, 1 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 -; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s1, s1, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 @@ -733,12 +756,13 @@ ; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_lshr_b32 s3, s3, s6 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, 8 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; @@ -749,23 +773,25 @@ ; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_and_b32 s7, 0xffff, 1 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: 
s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s6 ; GFX10-NEXT: s_and_b32 s6, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 -; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_lshr_b32 s4, s4, s7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s7 ; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff +; GFX10-NEXT: s_and_b32 s2, 0xffff, 8 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; @@ -776,23 +802,25 @@ ; GFX11-NEXT: s_and_b32 s4, s4, 0xff ; GFX11-NEXT: s_and_b32 s6, s2, 7 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_and_b32 s7, 0xffff, 1 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s6, s5, 7 ; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_lshr_b32 s4, s4, s7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshr_b32 s1, s1, s7 ; GFX11-NEXT: s_lshl_b32 s3, s3, s6 ; GFX11-NEXT: s_lshr_b32 s4, s4, s5 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_and_b32 s2, 0xffff, 8 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_lshl_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -1001,18 +1029,19 @@ ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 +; GFX8-NEXT: s_and_b32 s12, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, s12 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s12, 0xffff, 1 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 -; GFX8-NEXT: s_and_b32 s12, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, s12 +; GFX8-NEXT: s_lshr_b32 s1, s1, s12 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 @@ -1020,7 +1049,7 @@ ; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s3, s3, s12 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 7 @@ -1028,14 +1057,14 @@ ; GFX8-NEXT: s_and_b32 s4, s7, 0xff ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 -; GFX8-NEXT: s_lshr_b32 s4, s4, 1 +; GFX8-NEXT: s_lshr_b32 s4, s4, s12 ; GFX8-NEXT: s_lshr_b32 s3, s4, s3 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 ; GFX8-NEXT: s_lshl_b32 s3, s5, s3 -; GFX8-NEXT: s_lshr_b32 s5, s8, 1 +; GFX8-NEXT: s_lshr_b32 s5, s8, s12 ; GFX8-NEXT: s_and_b32 s0, 
s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshr_b32 s4, s5, s4 @@ -1054,18 +1083,19 @@ ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 +; GFX9-NEXT: s_and_b32 s12, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, s12 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s12, 0xffff, 1 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 -; GFX9-NEXT: s_and_b32 s12, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_lshl_b32 s0, s0, s12 +; GFX9-NEXT: s_lshr_b32 s1, s1, s12 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 @@ -1073,7 +1103,7 @@ ; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_lshr_b32 s3, s3, s12 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s10, 7 @@ -1081,14 +1111,14 @@ ; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 -; GFX9-NEXT: s_lshr_b32 s4, s4, 1 +; GFX9-NEXT: s_lshr_b32 s4, s4, s12 ; GFX9-NEXT: s_lshr_b32 s3, s4, s3 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_and_b32 s3, s11, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 ; GFX9-NEXT: s_lshl_b32 s3, s5, s3 -; GFX9-NEXT: s_lshr_b32 s5, s8, 1 +; GFX9-NEXT: s_lshr_b32 s5, s8, s12 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_lshr_b32 s4, s5, s4 @@ -1108,20 +1138,21 @@ ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_and_b32 s13, 0xffff, 1 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_and_b32 s12, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s13 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_and_b32 s2, s6, 0xff ; GFX10-NEXT: s_and_b32 s6, s9, 7 ; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_lshr_b32 s2, s2, s13 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 ; GFX10-NEXT: s_lshl_b32 s0, s0, s12 @@ -1133,12 +1164,12 @@ ; GFX10-NEXT: s_and_b32 s3, s10, 7 ; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_andn2_b32 s6, 7, s10 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_lshr_b32 s2, s2, s13 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshr_b32 s2, s2, s6 ; GFX10-NEXT: s_and_b32 s4, s11, 7 ; GFX10-NEXT: s_andn2_b32 s6, 7, s11 -; GFX10-NEXT: s_lshr_b32 s7, s8, 1 +; GFX10-NEXT: s_lshr_b32 s7, s8, s13 ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s7, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 @@ -1161,20 +1192,21 @@ ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_and_b32 s13, 0xffff, 1 ; GFX11-NEXT: 
s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s9, s2, 8 ; GFX11-NEXT: s_lshr_b32 s10, s2, 16 ; GFX11-NEXT: s_lshr_b32 s11, s2, 24 ; GFX11-NEXT: s_and_b32 s12, s2, 7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshr_b32 s1, s1, s13 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_and_b32 s2, s6, 0xff ; GFX11-NEXT: s_and_b32 s6, s9, 7 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_lshr_b32 s2, s2, s13 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 ; GFX11-NEXT: s_lshl_b32 s0, s0, s12 @@ -1186,12 +1218,12 @@ ; GFX11-NEXT: s_and_b32 s3, s10, 7 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_lshr_b32 s2, s2, s13 ; GFX11-NEXT: s_lshl_b32 s3, s4, s3 ; GFX11-NEXT: s_lshr_b32 s2, s2, s6 ; GFX11-NEXT: s_and_b32 s4, s11, 7 ; GFX11-NEXT: s_and_not1_b32 s6, 7, s11 -; GFX11-NEXT: s_lshr_b32 s7, s8, 1 +; GFX11-NEXT: s_lshr_b32 s7, s8, s13 ; GFX11-NEXT: s_lshl_b32 s4, s5, s4 ; GFX11-NEXT: s_lshr_b32 s5, s7, s6 ; GFX11-NEXT: s_or_b32 s2, s3, s2 @@ -1936,10 +1968,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_and_b32 s6, s6, 0xff +; GFX8-NEXT: s_and_b32 s10, 0xffff, 8 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 @@ -1950,7 +1983,7 @@ ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s10 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 @@ -1964,7 +1997,7 @@ ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 @@ -1975,7 +2008,7 @@ ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, 8 +; GFX8-NEXT: s_lshl_b32 s3, s3, s10 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s3, s8, s3 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 @@ -1988,7 +2021,7 @@ ; GFX8-NEXT: s_lshr_b32 s7, s4, 16 ; GFX8-NEXT: s_lshr_b32 s8, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s4, s4, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 @@ -2005,7 +2038,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_lshl_b32 s5, s5, s10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 @@ -2065,12 +2098,13 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 +; GFX9-NEXT: s_and_b32 s12, 0xffff, 8 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_lshr_b32 s9, 
s0, 16 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s0, s0, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 @@ -2081,7 +2115,7 @@ ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_lshl_b32 s1, s1, s12 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 ; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 @@ -2094,7 +2128,7 @@ ; GFX9-NEXT: s_lshr_b32 s9, s2, 16 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s2, s2, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 @@ -2106,7 +2140,7 @@ ; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 +; GFX9-NEXT: s_lshl_b32 s3, s3, s12 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s3, s10, s3 ; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 @@ -2120,7 +2154,7 @@ ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_lshr_b32 s10, s4, 24 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 @@ -2132,7 +2166,7 @@ ; GFX9-NEXT: s_lshr_b32 s11, s5, 8 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_lshl_b32 s5, s5, s12 ; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s5, s10, s5 ; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 @@ -2194,61 +2228,61 @@ ; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: s_and_b32 s10, 0xffff, 8 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_lshl_b32 s6, s6, s10 ; GFX10-NEXT: s_and_b32 s7, s7, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX10-NEXT: s_lshr_b32 s7, s4, 8 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: s_and_b32 s7, s7, 0xff -; GFX10-NEXT: s_lshr_b32 s11, s4, 24 -; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_lshr_b32 s12, s4, 24 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_lshl_b32 s7, s7, 8 -; GFX10-NEXT: s_lshr_b32 s12, s5, 8 -; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, s10 +; GFX10-NEXT: s_lshr_b32 s13, s5, 8 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_and_b32 s7, s10, 0xff +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_and_b32 s7, s11, 0xff ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_and_b32 s5, s5, 0xff ; GFX10-NEXT: s_lshl_b32 s7, s7, 16 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX10-NEXT: s_lshl_b32 s5, s5, s10 ; GFX10-NEXT: s_or_b32 s4, s4, s7 -; GFX10-NEXT: s_and_b32 s7, s12, 0xff -; GFX10-NEXT: s_or_b32 s5, s11, s5 +; GFX10-NEXT: s_and_b32 s7, s13, 0xff 
+; GFX10-NEXT: s_or_b32 s5, s12, s5 ; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX10-NEXT: s_lshl_b32 s7, s7, 16 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 ; GFX10-NEXT: s_lshr_b32 s9, s1, 8 -; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_or_b32 s5, s5, s7 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_and_b32 s7, s9, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s1, s10 +; GFX10-NEXT: s_lshr_b32 s9, s2, 16 ; GFX10-NEXT: s_or_b32 s1, s8, s1 ; GFX10-NEXT: s_lshr_b32 s8, s2, 8 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 -; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s8, s8, s10 ; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 ; GFX10-NEXT: s_lshr_b32 s4, s3, 8 @@ -2258,13 +2292,13 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, s10 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: s_or_b32 s3, s10, s3 +; GFX10-NEXT: s_or_b32 s3, s11, s3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 @@ -2280,16 +2314,17 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: s_lshl_b32 s6, s6, 16 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s7, s7, 16 -; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: s_lshr_b32 s2, s3, 1 +; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_or_b32 s1, s1, s7 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v2 ; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2 @@ -2314,75 +2349,75 @@ ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_and_b32 s9, 0xffff, 8 ; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11-NEXT: s_lshr_b32 s8, s0, 24 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_lshr_b32 s10, s4, 24 +; GFX11-NEXT: s_lshl_b32 s6, s6, s9 +; GFX11-NEXT: s_lshr_b32 s11, s4, 24 ; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s6, s7, 0xff ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; 
GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mul_f32 v0, 0x4f7ffffe, v0 :: v_dual_mul_f32 v1, 0x4f7ffffe, v1 -; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_lshr_b32 s7, s4, 16 -; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_lshr_b32 s6, s4, 8 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX11-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, s9 +; GFX11-NEXT: s_lshr_b32 s12, s5, 8 ; GFX11-NEXT: s_or_b32 s4, s4, s6 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 -; GFX11-NEXT: s_lshr_b32 s11, s5, 8 -; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 ; GFX11-NEXT: s_and_b32 s5, s5, 0xff ; GFX11-NEXT: s_or_b32 s4, s4, s6 -; GFX11-NEXT: s_lshl_b32 s5, s5, 8 -; GFX11-NEXT: s_and_b32 s6, s11, 0xff -; GFX11-NEXT: s_or_b32 s5, s10, s5 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_lshl_b32 s5, s5, s9 +; GFX11-NEXT: s_and_b32 s6, s12, 0xff +; GFX11-NEXT: s_or_b32 s5, s11, s5 ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_hi_u32 v2, v1, v3 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 -; GFX11-NEXT: s_lshr_b32 s9, s1, 8 ; GFX11-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_lshr_b32 s10, s1, 8 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_and_b32 s7, s9, 0xff ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s7, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, s9 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 -; GFX11-NEXT: s_lshr_b32 s7, s2, 8 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX11-NEXT: s_lshr_b32 s7, s2, 8 ; GFX11-NEXT: s_or_b32 s1, s8, s1 ; GFX11-NEXT: s_lshr_b32 s8, s2, 16 ; GFX11-NEXT: s_and_b32 s7, s7, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s3, 8 -; GFX11-NEXT: s_lshl_b32 s7, s7, 8 -; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-NEXT: s_lshl_b32 s7, s7, s9 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX11-NEXT: s_lshr_b32 s4, s2, 24 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: s_or_b32 s2, s2, s7 -; GFX11-NEXT: s_or_b32 s3, s4, s3 +; GFX11-NEXT: s_lshl_b32 s3, s3, s9 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s5, v1 ; GFX11-NEXT: s_and_b32 s5, s8, 0xff ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo @@ -2392,16 +2427,17 @@ ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 -; GFX11-NEXT: s_and_b32 s4, s9, 0xff +; GFX11-NEXT: s_or_b32 s3, s4, s3 ; GFX11-NEXT: s_or_b32 s2, s2, s5 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_and_b32 s4, s10, 0xff ; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_and_b32 s1, 0xffff, 
s1 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s6, s6, 16 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 @@ -3353,12 +3389,13 @@ ; GFX8-LABEL: s_fshl_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s3, s2, 15 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, 1 +; GFX8-NEXT: s_lshr_b32 s1, s1, s3 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -3366,12 +3403,13 @@ ; GFX9-LABEL: s_fshl_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s2, 15 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_andn2_b32 s2, 15, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, 1 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -3381,8 +3419,9 @@ ; GFX10-NEXT: s_and_b32 s3, s2, 15 ; GFX10-NEXT: s_andn2_b32 s2, 15, s2 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s4 ; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 @@ -3394,8 +3433,9 @@ ; GFX11-NEXT: s_and_b32 s3, s2, 15 ; GFX11-NEXT: s_and_not1_b32 s2, 15, s2 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshr_b32 s1, s1, s4 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 @@ -3416,33 +3456,41 @@ ; ; GFX8-LABEL: s_fshl_i16_4: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s2, 0xffff, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 12 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 12 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i16_4: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s2, 0xffff, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_lshr_b32 s1, s1, 12 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 12 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i16_4: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s2, 0xffff, 4 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_lshr_b32 s1, s1, 12 +; GFX10-NEXT: s_and_b32 s3, 0xffff, 12 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i16_4: ; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_and_b32 s2, 0xffff, 4 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_lshr_b32 s1, s1, 12 +; GFX11-NEXT: s_and_b32 s3, 0xffff, 12 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3460,33 +3508,41 @@ ; ; GFX8-LABEL: s_fshl_i16_5: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s2, 0xffff, 5 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 5 -; GFX8-NEXT: s_lshr_b32 s1, s1, 11 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 11 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i16_5: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s2, 0xffff, 5 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 5 -; GFX9-NEXT: s_lshr_b32 s1, s1, 11 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 11 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i16_5: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s2, 0xffff, 5 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 5 -; GFX10-NEXT: s_lshr_b32 s1, s1, 11 +; GFX10-NEXT: s_and_b32 s3, 0xffff, 11 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i16_5: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s2, 0xffff, 5 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_lshr_b32 s1, s1, 11 +; GFX11-NEXT: s_and_b32 s3, 0xffff, 11 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3672,8 +3728,9 @@ ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, 1 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog @@ -3684,8 +3741,9 @@ ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s1, 0xffff, 1 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -3695,7 +3753,8 @@ ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0 ; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1 @@ -3707,10 +3766,10 @@ ; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0 ; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -3801,10 +3860,11 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, s2, v0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, s2, v0 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog @@ -3813,10 +3873,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s2, s1, 15 ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, s2, v0 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 1 +; GFX9-NEXT: s_lshr_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, s2, v0 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -3826,8 +3887,9 @@ ; GFX10-NEXT: s_and_b32 s2, s1, 15 ; GFX10-NEXT: s_andn2_b32 s1, 15, s1 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10-NEXT: s_and_b32 s3, 0xffff, 1 ; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0 -; GFX10-NEXT: s_lshr_b32 s0, s0, 1 +; GFX10-NEXT: s_lshr_b32 s0, s0, s3 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3838,8 +3900,9 @@ ; GFX11-NEXT: s_and_b32 s2, s1, 15 ; GFX11-NEXT: s_and_not1_b32 s1, 15, s1 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_and_b32 s3, 0xffff, 1 ; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0 -; GFX11-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-NEXT: s_lshr_b32 s0, s0, s3 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s0, s0, s1 @@ -3878,23 +3941,24 @@ ; ; GFX8-LABEL: s_fshl_v2i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s6, s2, 15 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 ; GFX8-NEXT: s_lshr_b32 s5, s2, 16 -; GFX8-NEXT: s_and_b32 s6, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, 1 +; GFX8-NEXT: s_lshr_b32 s1, s1, s6 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s5 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_lshr_b32 s3, s4, 1 +; GFX8-NEXT: s_lshr_b32 s3, s4, s6 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 @@ -4071,13 +4135,17 @@ ; GFX6-LABEL: v_fshl_v2i16_4_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s4, 0xffff, 4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 11, v2 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 11 +; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 7 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -4158,19 +4226,20 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s1, 0xffff, 1 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: s_lshr_b32 s0, s3, s1 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 @@ -4354,20 +4423,21 @@ ; ; GFX8-LABEL: v_fshl_v2i16_vss: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s4, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v0 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 ; GFX8-NEXT: s_and_b32 s0, s3, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: s_lshr_b32 s0, s2, s4 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 @@ -4476,23 +4546,24 @@ ; ; GFX8-LABEL: s_fshl_v3i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s9, s4, 15 +; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s4, 16 -; GFX8-NEXT: s_and_b32 s9, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s9 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s9, 0xffff, 1 +; GFX8-NEXT: s_lshr_b32 s2, s2, s9 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s9 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s8, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s8 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_lshr_b32 s6, s7, 1 +; GFX8-NEXT: s_lshr_b32 s6, s7, s9 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 @@ -4501,7 +4572,7 @@ ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s3, s3, s9 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: 
s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 @@ -4832,23 +4903,24 @@ ; ; GFX8-LABEL: s_fshl_v4i16: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s12, s4, 15 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 16 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_and_b32 s12, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s12 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s12, 0xffff, 1 +; GFX8-NEXT: s_lshr_b32 s2, s2, s12 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, s12 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s10 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 -; GFX8-NEXT: s_lshr_b32 s6, s8, 1 +; GFX8-NEXT: s_lshr_b32 s6, s8, s12 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 @@ -4860,14 +4932,14 @@ ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s3, s3, s12 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s1, s1, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s11 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s5, s9, 1 +; GFX8-NEXT: s_lshr_b32 s5, s9, s12 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s3, s7, s3 ; GFX8-NEXT: s_lshr_b32 s4, s5, s4 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -41,10 +41,11 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s1, 0x7f +; GFX8-NEXT: s_and_b32 s3, 0xffff, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -71,10 +72,11 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: s_and_b32 s1, s1, 0x7f +; GFX9-NEXT: s_and_b32 s3, 0xffff, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -100,7 +102,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX10-NEXT: s_and_b32 s2, s2, 0x7f -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_and_b32 s1, s1, 0x7f ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -111,6 +112,8 @@ ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10-NEXT: s_and_b32 s2, 0xffff, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; 
GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -130,7 +133,6 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7f -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_and_b32 s1, s1, 0x7f ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -146,23 +148,26 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_and_b32 s2, 0xffff, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u16 v1, 6, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX11-NEXT: v_lshrrev_b16 v0, v0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b16 v1, v1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -347,10 +352,11 @@ ; ; GFX8-LABEL: s_fshr_i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s3, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 @@ -359,10 +365,11 @@ ; ; GFX9-LABEL: s_fshr_i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s3, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s4 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, s3 @@ -371,10 +378,11 @@ ; ; GFX10-LABEL: s_fshr_i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_and_b32 s3, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 @@ -383,10 +391,11 @@ ; ; GFX11-LABEL: s_fshr_i8: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_and_b32 s3, s2, 7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 -; 
GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s4 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, s3 @@ -477,37 +486,41 @@ ; GFX8-LABEL: s_fshr_i8_4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, 4 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i8_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, 4 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_lshr_b32 s1, s1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i8_4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_and_b32 s2, 0xffff, 4 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 4 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i8_4: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_and_b32 s2, 0xffff, 4 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s1, s1, 4 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4) @@ -573,38 +586,46 @@ ; ; GFX8-LABEL: s_fshr_i8_5: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s2, 0xffff, 3 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: s_lshr_b32 s1, s1, 5 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 5 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i8_5: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s2, 0xffff, 3 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 3 -; GFX9-NEXT: s_lshr_b32 s1, s1, 5 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 5 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i8_5: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: s_and_b32 s2, 0xffff, 3 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 5 +; GFX10-NEXT: s_and_b32 s3, 0xffff, 5 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i8_5: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, 3 +; GFX11-NEXT: s_and_b32 s2, 0xffff, 3 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s1, s1, 5 +; GFX11-NEXT: s_and_b32 
s3, 0xffff, 5 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5) @@ -687,17 +708,18 @@ ; ; GFX8-LABEL: s_fshr_v2i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s7, 0xffff, 1 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_lshl_b32 s3, s3, s7 ; GFX8-NEXT: s_lshr_b32 s1, s1, s6 ; GFX8-NEXT: s_lshl_b32 s2, s3, s2 ; GFX8-NEXT: s_and_b32 s3, s4, 0xff @@ -707,24 +729,26 @@ ; GFX8-NEXT: s_lshr_b32 s1, s3, s1 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, 8 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v2i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s7, 0xffff, 1 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 +; GFX9-NEXT: s_lshl_b32 s3, s3, s7 ; GFX9-NEXT: s_lshr_b32 s1, s1, s6 ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 ; GFX9-NEXT: s_and_b32 s3, s4, 0xff @@ -734,25 +758,27 @@ ; GFX9-NEXT: s_lshr_b32 s1, s3, s1 ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, 8 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_and_b32 s7, 0xffff, 1 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s5, s2, 8 ; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s7 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_lshl_b32 s3, s3, s7 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 @@ -761,25 +787,27 @@ ; GFX10-NEXT: s_or_b32 s2, s3, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff +; GFX10-NEXT: s_and_b32 s2, 0xffff, 8 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s4, s1, 8 +; GFX11-NEXT: s_and_b32 s7, 0xffff, 1 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_lshr_b32 s5, s2, 8 ; GFX11-NEXT: s_and_b32 
s6, s2, 7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s7 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s5, 7 ; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 -; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_lshl_b32 s3, s3, s7 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s3, s3, s5 @@ -788,8 +816,9 @@ ; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_and_b32 s2, 0xffff, 8 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_lshl_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -997,6 +1026,7 @@ ; ; GFX8-LABEL: s_fshr_v4i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s13, 0xffff, 1 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 @@ -1008,12 +1038,12 @@ ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 ; GFX8-NEXT: s_and_b32 s12, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s13 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_lshl_b32 s3, s3, s13 ; GFX8-NEXT: s_lshr_b32 s1, s1, s12 ; GFX8-NEXT: s_lshl_b32 s2, s3, s2 ; GFX8-NEXT: s_and_b32 s3, s6, 0xff @@ -1022,7 +1052,7 @@ ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s3, s1 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 -; GFX8-NEXT: s_lshl_b32 s4, s4, 1 +; GFX8-NEXT: s_lshl_b32 s4, s4, s13 ; GFX8-NEXT: s_lshl_b32 s3, s4, s3 ; GFX8-NEXT: s_and_b32 s4, s7, 0xff ; GFX8-NEXT: s_or_b32 s1, s2, s1 @@ -1033,7 +1063,7 @@ ; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: s_and_b32 s3, s11, 7 ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 -; GFX8-NEXT: s_lshl_b32 s5, s5, 1 +; GFX8-NEXT: s_lshl_b32 s5, s5, s13 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshl_b32 s4, s5, s4 @@ -1050,6 +1080,7 @@ ; ; GFX9-LABEL: s_fshr_v4i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s13, 0xffff, 1 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 @@ -1061,12 +1092,12 @@ ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 ; GFX9-NEXT: s_and_b32 s12, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s13 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 +; GFX9-NEXT: s_lshl_b32 s3, s3, s13 ; GFX9-NEXT: s_lshr_b32 s1, s1, s12 ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 ; GFX9-NEXT: s_and_b32 s3, s6, 0xff @@ -1075,7 +1106,7 @@ ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s3, s1 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 -; GFX9-NEXT: s_lshl_b32 s4, s4, 1 +; GFX9-NEXT: s_lshl_b32 s4, s4, s13 ; GFX9-NEXT: s_lshl_b32 s3, s4, s3 ; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s2, s1 @@ -1086,7 +1117,7 @@ ; GFX9-NEXT: s_or_b32 s2, s3, s2 ; GFX9-NEXT: s_and_b32 s3, s11, 7 ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 -; GFX9-NEXT: s_lshl_b32 s5, s5, 1 +; GFX9-NEXT: s_lshl_b32 s5, s5, s13 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; 
GFX9-NEXT: s_lshl_b32 s4, s5, s4 @@ -1104,6 +1135,7 @@ ; GFX10-LABEL: s_fshr_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_and_b32 s13, 0xffff, 1 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 @@ -1115,13 +1147,13 @@ ; GFX10-NEXT: s_and_b32 s12, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s13 ; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s9, 7 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 -; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_lshl_b32 s3, s3, s13 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, s9 @@ -1131,12 +1163,12 @@ ; GFX10-NEXT: s_or_b32 s1, s3, s2 ; GFX10-NEXT: s_and_b32 s2, s10, 7 ; GFX10-NEXT: s_andn2_b32 s3, 7, s10 -; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: s_lshl_b32 s4, s4, s13 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshr_b32 s2, s6, s2 ; GFX10-NEXT: s_andn2_b32 s4, 7, s11 -; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: s_lshl_b32 s5, s5, s13 ; GFX10-NEXT: s_and_b32 s6, s11, 7 ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s8, s6 @@ -1157,6 +1189,7 @@ ; GFX11-LABEL: s_fshr_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-NEXT: s_and_b32 s13, 0xffff, 1 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 @@ -1168,13 +1201,13 @@ ; GFX11-NEXT: s_and_b32 s12, s2, 7 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s13 ; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: s_and_b32 s2, s9, 7 ; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 -; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_lshl_b32 s3, s3, s13 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, s12 ; GFX11-NEXT: s_lshl_b32 s3, s3, s9 @@ -1184,12 +1217,12 @@ ; GFX11-NEXT: s_or_b32 s1, s3, s2 ; GFX11-NEXT: s_and_b32 s2, s10, 7 ; GFX11-NEXT: s_and_not1_b32 s3, 7, s10 -; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-NEXT: s_lshl_b32 s4, s4, s13 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_lshl_b32 s3, s4, s3 ; GFX11-NEXT: s_lshr_b32 s2, s6, s2 ; GFX11-NEXT: s_and_not1_b32 s4, 7, s11 -; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_lshl_b32 s5, s5, s13 ; GFX11-NEXT: s_and_b32 s6, s11, 7 ; GFX11-NEXT: s_lshl_b32 s4, s5, s4 ; GFX11-NEXT: s_lshr_b32 s5, s8, s6 @@ -1945,10 +1978,11 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 +; GFX8-NEXT: s_and_b32 s10, 0xffff, 8 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s10 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 @@ -1956,28 +1990,28 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: 
s_and_b32 s7, s9, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s2, 16 -; GFX8-NEXT: s_lshr_b32 s10, s2, 24 +; GFX8-NEXT: s_lshr_b32 s11, s2, 24 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff -; GFX8-NEXT: s_lshl_b32 s8, s8, 8 +; GFX8-NEXT: s_lshl_b32 s8, s8, s10 ; GFX8-NEXT: s_or_b32 s2, s2, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX8-NEXT: s_lshr_b32 s11, s3, 8 +; GFX8-NEXT: s_lshr_b32 s12, s3, 8 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_lshl_b32 s3, s3, 8 -; GFX8-NEXT: s_and_b32 s8, s11, 0xff -; GFX8-NEXT: s_or_b32 s3, s10, s3 +; GFX8-NEXT: s_lshl_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s8, s12, 0xff +; GFX8-NEXT: s_or_b32 s3, s11, s3 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 @@ -1986,9 +2020,9 @@ ; GFX8-NEXT: s_lshr_b32 s8, s4, 8 ; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: s_lshr_b32 s10, s4, 24 +; GFX8-NEXT: s_lshr_b32 s11, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff -; GFX8-NEXT: s_lshl_b32 s8, s8, 8 +; GFX8-NEXT: s_lshl_b32 s8, s8, s10 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 @@ -2001,18 +2035,18 @@ ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: s_lshr_b32 s11, s5, 8 +; GFX8-NEXT: s_lshr_b32 s12, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v2, v1 -; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_lshl_b32 s5, s5, s10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: s_and_b32 s8, s11, 0xff +; GFX8-NEXT: s_and_b32 s8, s12, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: s_or_b32 s5, s10, s5 +; GFX8-NEXT: s_or_b32 s5, s11, s5 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 @@ -2073,42 +2107,43 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s12, 0xffff, 8 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX9-NEXT: s_lshl_b32 s1, s1, s12 ; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 -; GFX9-NEXT: s_lshr_b32 s10, s2, 8 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: s_lshr_b32 s10, s2, 8 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s7 ; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_and_b32 s9, s11, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s2, 16 -; GFX9-NEXT: s_lshr_b32 s12, s2, 24 +; GFX9-NEXT: s_lshr_b32 s13, s2, 24 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_lshl_b32 s10, s10, s12 ; GFX9-NEXT: s_or_b32 s2, s2, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; 
GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s13, s3, 8 +; GFX9-NEXT: s_lshr_b32 s14, s3, 8 ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: s_or_b32 s2, s2, s10 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_and_b32 s10, s13, 0xff -; GFX9-NEXT: s_or_b32 s3, s12, s3 +; GFX9-NEXT: s_lshl_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s10, s14, 0xff +; GFX9-NEXT: s_or_b32 s3, s13, s3 ; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -2118,9 +2153,9 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 24 +; GFX9-NEXT: s_lshr_b32 s13, s4, 24 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_lshl_b32 s10, s10, s12 ; GFX9-NEXT: s_or_b32 s4, s4, s10 ; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 @@ -2129,12 +2164,12 @@ ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s10 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: s_lshr_b32 s13, s5, 8 +; GFX9-NEXT: s_lshr_b32 s14, s5, 8 ; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_and_b32 s10, s13, 0xff -; GFX9-NEXT: s_or_b32 s5, s12, s5 +; GFX9-NEXT: s_lshl_b32 s5, s5, s12 +; GFX9-NEXT: s_and_b32 s10, s14, 0xff +; GFX9-NEXT: s_or_b32 s5, s13, s5 ; GFX9-NEXT: s_and_b32 s10, 0xffff, s10 ; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -2198,104 +2233,105 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; GFX10-NEXT: s_and_b32 s10, 0xffff, 8 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, s10 ; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_or_b32 s1, s8, s1 ; GFX10-NEXT: s_lshr_b32 s8, s4, 8 ; GFX10-NEXT: s_lshr_b32 s7, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: s_lshl_b32 s6, s6, s10 ; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s7, 0xff ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff ; GFX10-NEXT: s_and_b32 s7, s9, 0xff ; GFX10-NEXT: s_lshr_b32 s9, s4, 16 -; GFX10-NEXT: s_lshr_b32 s10, s4, 24 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 +; GFX10-NEXT: s_lshr_b32 s11, s4, 24 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshr_b32 s11, s5, 8 +; GFX10-NEXT: s_lshl_b32 s8, s8, s10 +; GFX10-NEXT: s_lshr_b32 s12, s5, 8 ; GFX10-NEXT: s_or_b32 s4, s4, s8 ; GFX10-NEXT: s_and_b32 s8, s9, 0xff -; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: s_and_b32 s8, 0xffff, s8 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_and_b32 s5, s5, 0xff ; GFX10-NEXT: 
s_lshl_b32 s8, s8, 16 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_lshl_b32 s5, s5, s10 ; GFX10-NEXT: s_or_b32 s4, s4, s8 -; GFX10-NEXT: s_and_b32 s8, s11, 0xff ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: s_or_b32 s5, s10, s5 +; GFX10-NEXT: s_and_b32 s8, s12, 0xff +; GFX10-NEXT: s_or_b32 s5, s11, s5 ; GFX10-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_lshl_b32 s8, s8, 16 ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_or_b32 s5, s5, s8 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX10-NEXT: s_and_b32 s9, s9, 0xff -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: s_lshr_b32 s11, s3, 8 +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_lshr_b32 s12, s3, 8 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff -; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_lshl_b32 s9, s9, s10 ; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 ; GFX10-NEXT: s_and_b32 s3, s3, 0xff -; GFX10-NEXT: s_or_b32 s2, s2, s9 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_or_b32 s2, s2, s9 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s8 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, s10 ; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s5, s11, 0xff -; GFX10-NEXT: s_or_b32 s3, s10, s3 +; GFX10-NEXT: s_and_b32 s5, s12, 0xff +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_or_b32 s3, s11, s3 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_lshl_b32 s5, s5, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_or_b32 s3, s3, s5 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX10-NEXT: s_lshl_b32 s4, s6, 17 -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: s_or_b32 s0, s4, s0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_lshl_b32 s1, s1, 1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_lshl_b32 s2, s7, 17 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s3 @@ -2322,95 +2358,96 @@ ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 ; GFX11-NEXT: s_lshr_b32 s6, s0, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_and_b32 s9, 
0xffff, 8 ; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11-NEXT: s_lshr_b32 s8, s0, 24 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff -; GFX11-NEXT: s_lshl_b32 s6, s6, 8 -; GFX11-NEXT: s_lshr_b32 s9, s1, 8 +; GFX11-NEXT: s_lshl_b32 s6, s6, s9 +; GFX11-NEXT: s_lshr_b32 s10, s1, 8 ; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s6, s7, 0xff -; GFX11-NEXT: s_and_b32 s7, s9, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s4, 8 +; GFX11-NEXT: s_and_b32 s7, s10, 0xff ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mul_f32 v0, 0x4f7ffffe, v0 :: v_dual_mul_f32 v1, 0x4f7ffffe, v1 -; GFX11-NEXT: s_lshr_b32 s10, s4, 16 -; GFX11-NEXT: s_and_b32 s9, s9, 0xff -; GFX11-NEXT: s_and_b32 s11, s4, 0xff +; GFX11-NEXT: s_lshr_b32 s10, s4, 8 +; GFX11-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: s_lshl_b32 s9, s9, 8 -; GFX11-NEXT: s_and_b32 s10, s10, 0xff -; GFX11-NEXT: s_or_b32 s9, s11, s9 +; GFX11-NEXT: s_and_b32 s12, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s10, s9 +; GFX11-NEXT: s_and_b32 s11, s11, 0xff ; GFX11-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 +; GFX11-NEXT: s_or_b32 s10, s12, s10 ; GFX11-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 +; GFX11-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX11-NEXT: s_and_b32 s10, 0xffff, s10 -; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 -; GFX11-NEXT: s_lshl_b32 s10, s10, 16 -; GFX11-NEXT: s_lshr_b32 s11, s5, 8 -; GFX11-NEXT: s_or_b32 s9, s9, s10 -; GFX11-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-NEXT: s_lshl_b32 s11, s11, 16 +; GFX11-NEXT: s_lshr_b32 s12, s5, 8 +; GFX11-NEXT: s_or_b32 s10, s10, s11 ; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX11-NEXT: s_and_b32 s5, s5, 0xff ; GFX11-NEXT: s_lshr_b32 s4, s4, 24 -; GFX11-NEXT: s_lshl_b32 s5, s5, 8 -; GFX11-NEXT: s_and_b32 s10, s11, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s5, s9 +; GFX11-NEXT: s_and_b32 s11, s12, 0xff ; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s10 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s11 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX11-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 24 +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: v_mul_hi_u32 v0, s10, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, s9 +; GFX11-NEXT: s_lshr_b32 s11, s2, 16 ; GFX11-NEXT: s_or_b32 s1, s8, s1 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX11-NEXT: s_lshr_b32 s8, s2, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshr_b32 s5, s2, 24 ; GFX11-NEXT: s_and_b32 s8, s8, 0xff ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: v_mul_hi_u32 v1, s4, v1 -; GFX11-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s8, s9 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: s_or_b32 s2, s2, s8 -; GFX11-NEXT: s_and_b32 s8, s10, 0xff +; GFX11-NEXT: s_and_b32 s8, s11, 0xff ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX11-NEXT: v_sub_nc_u32_e32 v0, s9, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s10, v0 ; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX11-NEXT: s_lshr_b32 s9, s3, 8 +; GFX11-NEXT: 
s_and_b32 s8, 0xffff, s8 +; GFX11-NEXT: s_lshr_b32 s10, s3, 8 ; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_or_b32 s2, s2, s8 +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, s9 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 -; GFX11-NEXT: s_and_b32 s4, s9, 0xff +; GFX11-NEXT: s_and_b32 s4, s10, 0xff ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_or_b32 s2, s2, s8 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_or_b32 s3, s5, s3 ; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 -; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_or_b32 s3, s5, s3 ; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshl_b32 s4, s4, 16 ; GFX11-NEXT: s_lshl_b32 s5, s6, 17 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: s_or_b32 s0, s5, s0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_or_b32 s0, s5, s0 ; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX11-NEXT: s_lshl_b32 s1, s1, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: s_lshl_b32 s1, s1, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 @@ -3099,7 +3136,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s3, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 @@ -3112,7 +3150,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s3, s2, 15 ; GFX9-NEXT: s_andn2_b32 s2, 15, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s4, 0xffff, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s4 ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 @@ -3124,8 +3163,9 @@ ; GFX10-LABEL: s_fshr_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s3, s2, 15 +; GFX10-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX10-NEXT: s_andn2_b32 s2, 15, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 ; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 @@ -3137,8 +3177,9 @@ ; GFX11-LABEL: s_fshr_i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s3, s2, 15 +; GFX11-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX11-NEXT: s_and_not1_b32 s2, 15, s2 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s4 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 @@ -3161,33 +3202,41 @@ ; ; GFX8-LABEL: s_fshr_i16_4: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s2, 0xffff, 12 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 12 -; GFX8-NEXT: s_lshr_b32 s1, s1, 4 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 4 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i16_4: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s2, 0xffff, 12 +; GFX9-NEXT: s_lshl_b32 s0, s0, 
s2 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 12 -; GFX9-NEXT: s_lshr_b32 s1, s1, 4 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 4 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i16_4: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s2, 0xffff, 12 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 12 -; GFX10-NEXT: s_lshr_b32 s1, s1, 4 +; GFX10-NEXT: s_and_b32 s3, 0xffff, 4 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i16_4: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s2, 0xffff, 12 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 12 -; GFX11-NEXT: s_lshr_b32 s1, s1, 4 +; GFX11-NEXT: s_and_b32 s3, 0xffff, 4 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3205,33 +3254,41 @@ ; ; GFX8-LABEL: s_fshr_i16_5: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s2, 0xffff, 11 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 11 -; GFX8-NEXT: s_lshr_b32 s1, s1, 5 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 5 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i16_5: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s2, 0xffff, 11 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 11 -; GFX9-NEXT: s_lshr_b32 s1, s1, 5 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 5 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i16_5: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s2, 0xffff, 11 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, 11 -; GFX10-NEXT: s_lshr_b32 s1, s1, 5 +; GFX10-NEXT: s_and_b32 s3, 0xffff, 5 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i16_5: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s2, 0xffff, 11 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 11 -; GFX11-NEXT: s_lshr_b32 s1, s1, 5 +; GFX11-NEXT: s_and_b32 s3, 0xffff, 5 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -3417,8 +3474,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 1 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3428,8 +3486,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 1 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3439,7 +3498,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 
v1, -1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1 ; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0 @@ -3450,12 +3510,14 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_and_b32 s2, 0xffff, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 ; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b16 v0, v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b16 v1, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) @@ -3481,7 +3543,8 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s2, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, s2, v0 @@ -3492,7 +3555,8 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s2, s1, 15 ; GFX9-NEXT: s_andn2_b32 s1, 15, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: v_lshrrev_b16_e32 v0, s2, v0 @@ -3502,9 +3566,10 @@ ; GFX10-LABEL: v_fshr_i16_svs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s2, s1, 15 +; GFX10-NEXT: s_and_b32 s3, 0xffff, 1 ; GFX10-NEXT: s_andn2_b32 s1, 15, s1 ; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0 -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 @@ -3513,9 +3578,10 @@ ; GFX11-LABEL: v_fshr_i16_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s2, s1, 15 +; GFX11-NEXT: s_and_b32 s3, 0xffff, 1 ; GFX11-NEXT: s_and_not1_b32 s1, 15, s1 ; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0 -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, s0, s1 @@ -3600,14 +3666,16 @@ ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NEXT: s_or_b32 s4, s5, s4 -; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_lshr_b32 s5, s5, 14 -; GFX6-NEXT: s_or_b32 s0, s0, s5 +; GFX6-NEXT: s_and_b32 s5, 0xffff, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s5 +; GFX6-NEXT: s_bfe_u32 s6, s2, 0xf0001 +; GFX6-NEXT: s_and_b32 s7, 0xffff, 14 +; GFX6-NEXT: s_lshl_b32 s1, s1, s5 ; GFX6-NEXT: s_bfe_u32 s5, s3, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_lshr_b32 s5, s5, 14 +; GFX6-NEXT: s_lshr_b32 s6, s6, s7 +; GFX6-NEXT: s_lshr_b32 s5, s5, s7 ; GFX6-NEXT: s_xor_b32 s4, s4, -1 +; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_or_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: s_lshr_b32 s5, s4, 16 @@ -3636,34 +3704,36 @@ ; ; GFX8-LABEL: s_fshr_v2i16: ; GFX8: ; %bb.0: -; 
GFX8-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s5, 0xffff, 1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s7, 0xffff, 15 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_lshr_b32 s5, s5, 15 -; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s5, s4, 15 -; GFX8-NEXT: s_lshl_b32 s1, s1, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s5 +; GFX8-NEXT: s_lshr_b32 s6, s6, s7 +; GFX8-NEXT: s_or_b32 s0, s0, s6 +; GFX8-NEXT: s_lshl_b32 s3, s3, s5 +; GFX8-NEXT: s_lshr_b32 s6, s4, s7 +; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_xor_b32 s2, s2, -1 -; GFX8-NEXT: s_or_b32 s3, s3, s5 -; GFX8-NEXT: s_lshr_b32 s5, s2, 16 -; GFX8-NEXT: s_and_b32 s6, s2, 15 +; GFX8-NEXT: s_or_b32 s3, s3, s6 +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_and_b32 s7, s2, 15 ; GFX8-NEXT: s_andn2_b32 s2, 15, s2 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX8-NEXT: s_lshr_b32 s1, s1, s5 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 +; GFX8-NEXT: s_lshl_b32 s0, s0, s7 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s5, 15 -; GFX8-NEXT: s_lshl_b32 s4, s4, 1 +; GFX8-NEXT: s_and_b32 s1, s6, 15 +; GFX8-NEXT: s_lshl_b32 s4, s4, s5 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_andn2_b32 s2, 15, s5 +; GFX8-NEXT: s_andn2_b32 s2, 15, s6 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s3, s3, s5 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 @@ -3751,13 +3821,15 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX6-NEXT: v_bfe_u32 v5, v2, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5 +; GFX6-NEXT: s_and_b32 s5, 0xffff, 14 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, s5, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX6-NEXT: v_bfe_u32 v5, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 14, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, s5, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 @@ -3862,13 +3934,17 @@ ; GFX6-LABEL: v_fshr_v2i16_4_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s4, 0xffff, 12 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 12, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 3, v2 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 7, v2 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 7 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3924,14 +4000,16 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: 
s_bfe_u32 s4, s2, 0xf0001 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 1 +; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001 +; GFX6-NEXT: s_and_b32 s6, 0xffff, 14 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_lshr_b32 s4, s4, 14 +; GFX6-NEXT: s_lshl_b32 s0, s0, s4 +; GFX6-NEXT: s_lshr_b32 s5, s5, s6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_or_b32 s0, s0, s4 +; GFX6-NEXT: s_or_b32 s0, s0, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -3939,12 +4017,12 @@ ; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_lshr_b32 s4, s4, 14 +; GFX6-NEXT: s_lshr_b32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: s_or_b32 s1, s1, s4 @@ -3962,32 +4040,34 @@ ; ; GFX8-LABEL: v_fshr_v2i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s4, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, 1 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, 15 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_lshr_b32 s4, s4, 15 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_lshr_b32 s5, s5, s6 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_lshl_b32 s1, s1, 1 +; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 ; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: s_lshr_b32 s4, s3, 15 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 +; GFX8-NEXT: s_lshr_b32 s5, s3, s6 +; GFX8-NEXT: s_lshl_b32 s3, s3, s4 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, 1 +; GFX8-NEXT: s_lshl_b32 s2, s2, s4 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 @@ -4046,16 +4126,18 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_bfe_u32 v2, v0, 1, 15 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 14, v2 -; GFX6-NEXT: v_bfe_u32 v3, v1, 1, 15 ; GFX6-NEXT: s_or_b32 s2, s3, s2 +; GFX6-NEXT: s_and_b32 s3, 0xffff, 1 +; GFX6-NEXT: v_bfe_u32 v2, v0, 1, 15 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 14 +; GFX6-NEXT: s_lshl_b32 s0, s0, s3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_bfe_u32 v3, v1, 1, 15 ; GFX6-NEXT: v_or_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_lshl_b32 s0, s1, 1 -; GFX6-NEXT: 
v_lshrrev_b32_e32 v3, 14, v3 +; GFX6-NEXT: s_lshl_b32 s0, s1, s3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3 ; GFX6-NEXT: v_or_b32_e32 v3, s0, v3 ; GFX6-NEXT: s_xor_b32 s0, s2, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 @@ -4085,12 +4167,13 @@ ; ; GFX8-LABEL: v_fshr_v2i16_svs: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s3, 0xffff, 1 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 15, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 15 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX8-NEXT: s_lshl_b32 s0, s2, 1 +; GFX8-NEXT: s_lshl_b32 s0, s2, s3 ; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_e32 v2, s0, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0 @@ -4178,14 +4261,16 @@ ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_or_b32 s2, s3, s2 -; GFX6-NEXT: s_bfe_u32 s3, s0, 0xf0001 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_lshr_b32 s3, s3, 14 -; GFX6-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_and_b32 s3, 0xffff, 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_bfe_u32 s4, s0, 0xf0001 +; GFX6-NEXT: s_and_b32 s5, 0xffff, 14 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s3, v1 ; GFX6-NEXT: s_bfe_u32 s3, s1, 0xf0001 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: s_lshr_b32 s3, s3, 14 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_lshr_b32 s3, s3, s5 ; GFX6-NEXT: s_xor_b32 s2, s2, -1 +; GFX6-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshr_b32 s3, s2, 16 @@ -4215,31 +4300,33 @@ ; GFX8-LABEL: v_fshr_v2i16_vss: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s3, 0xffff, s0 +; GFX8-NEXT: s_and_b32 s4, 0xffff, 15 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v0 -; GFX8-NEXT: s_lshr_b32 s3, s3, 15 +; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: s_lshr_b32 s3, s2, 15 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_xor_b32 s1, s1, -1 +; GFX8-NEXT: s_lshr_b32 s3, s2, s4 ; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_and_b32 s4, s1, 15 +; GFX8-NEXT: s_and_b32 s3, 0xffff, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_xor_b32 s1, s1, -1 +; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_and_b32 s5, s1, 15 ; GFX8-NEXT: s_andn2_b32 s1, 15, s1 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s5, v1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s2, s2, 1 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 ; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 -; GFX8-NEXT: s_and_b32 s0, s3, 15 -; GFX8-NEXT: s_andn2_b32 s1, 15, s3 +; GFX8-NEXT: s_and_b32 s0, s4, 15 +; GFX8-NEXT: s_andn2_b32 s1, 15, s4 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, s0, v0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s3 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 @@ -4307,28 +4394,30 @@ ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_or_b32 s6, s6, s7 ; GFX6-NEXT: s_and_b32 s7, s8, 0xffff -; GFX6-NEXT: s_bfe_u32 s8, s3, 
0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_lshr_b32 s8, s8, 14 -; GFX6-NEXT: s_or_b32 s0, s0, s8 -; GFX6-NEXT: s_bfe_u32 s8, s4, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_lshr_b32 s8, s8, 14 +; GFX6-NEXT: s_and_b32 s8, 0xffff, 1 +; GFX6-NEXT: s_bfe_u32 s9, s3, 0xf0001 +; GFX6-NEXT: s_and_b32 s10, 0xffff, 14 +; GFX6-NEXT: s_lshl_b32 s0, s0, s8 +; GFX6-NEXT: s_lshr_b32 s9, s9, s10 +; GFX6-NEXT: s_or_b32 s0, s0, s9 +; GFX6-NEXT: s_bfe_u32 s9, s4, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s1, s1, s8 +; GFX6-NEXT: s_lshr_b32 s9, s9, s10 ; GFX6-NEXT: s_xor_b32 s6, s6, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s8 +; GFX6-NEXT: s_or_b32 s1, s1, s9 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 -; GFX6-NEXT: s_lshr_b32 s8, s6, 16 -; GFX6-NEXT: s_and_b32 s9, s6, 15 +; GFX6-NEXT: s_lshr_b32 s9, s6, 16 +; GFX6-NEXT: s_and_b32 s11, s6, 15 ; GFX6-NEXT: s_andn2_b32 s6, 15, s6 -; GFX6-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX6-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 ; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX6-NEXT: s_lshl_b32 s0, s0, s9 +; GFX6-NEXT: s_lshl_b32 s0, s0, s11 ; GFX6-NEXT: s_lshr_b32 s3, s3, s6 ; GFX6-NEXT: s_or_b32 s0, s0, s3 -; GFX6-NEXT: s_and_b32 s3, s8, 15 +; GFX6-NEXT: s_and_b32 s3, s9, 15 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_andn2_b32 s6, 15, s8 +; GFX6-NEXT: s_andn2_b32 s6, 15, s9 ; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s1, s1, s3 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 @@ -4336,8 +4425,8 @@ ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s2, s2, 1 -; GFX6-NEXT: s_lshr_b32 s3, s3, 14 +; GFX6-NEXT: s_lshl_b32 s2, s2, s8 +; GFX6-NEXT: s_lshr_b32 s3, s3, s10 ; GFX6-NEXT: s_xor_b32 s4, s7, -1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s5, 1 @@ -4358,49 +4447,51 @@ ; ; GFX8-LABEL: s_fshr_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s8, 0xffff, 1 +; GFX8-NEXT: s_and_b32 s9, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s10, 0xffff, 15 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_lshr_b32 s8, s8, 15 -; GFX8-NEXT: s_or_b32 s0, s0, s8 -; GFX8-NEXT: s_lshl_b32 s6, s6, 1 -; GFX8-NEXT: s_lshr_b32 s8, s7, 15 -; GFX8-NEXT: s_lshl_b32 s2, s2, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_lshr_b32 s9, s9, s10 +; GFX8-NEXT: s_or_b32 s0, s0, s9 +; GFX8-NEXT: s_lshl_b32 s6, s6, s8 +; GFX8-NEXT: s_lshr_b32 s9, s7, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, s8 ; GFX8-NEXT: s_xor_b32 s4, s4, -1 -; GFX8-NEXT: s_or_b32 s6, s6, s8 -; GFX8-NEXT: s_lshr_b32 s8, s4, 16 -; GFX8-NEXT: s_and_b32 s9, s4, 15 +; GFX8-NEXT: s_or_b32 s6, s6, s9 +; GFX8-NEXT: s_lshr_b32 s9, s4, 16 +; GFX8-NEXT: s_and_b32 s11, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s11, 0xffff, s11 +; GFX8-NEXT: s_lshr_b32 s2, s2, s8 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s9 +; GFX8-NEXT: s_lshl_b32 s0, s0, s11 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s8, 15 -; GFX8-NEXT: s_lshl_b32 s7, s7, 1 +; GFX8-NEXT: s_and_b32 s2, s9, 15 +; GFX8-NEXT: s_lshl_b32 s7, s7, s8 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_andn2_b32 s4, 15, s8 +; GFX8-NEXT: s_andn2_b32 s4, 15, s9 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s7 -; GFX8-NEXT: 
s_lshr_b32 s6, s6, 1 +; GFX8-NEXT: s_lshr_b32 s6, s6, s8 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s3 ; GFX8-NEXT: s_and_b32 s5, s5, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 1 -; GFX8-NEXT: s_lshr_b32 s4, s4, 15 +; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_lshr_b32 s4, s4, s10 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_lshl_b32 s3, s3, s8 ; GFX8-NEXT: s_xor_b32 s4, s5, -1 ; GFX8-NEXT: s_and_b32 s5, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s3, s3, s8 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 @@ -4561,13 +4652,15 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX6-NEXT: v_bfe_u32 v8, v3, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8 +; GFX6-NEXT: s_and_b32 s5, 0xffff, 14 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, s5, v8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX6-NEXT: v_bfe_u32 v8, v4, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v8, 14, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, s5, v8 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v6 @@ -4592,8 +4685,8 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 14, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, s5, v3 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v7 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v5 @@ -4728,28 +4821,30 @@ ; GFX6-NEXT: s_lshl_b32 s9, s11, 16 ; GFX6-NEXT: s_and_b32 s10, s10, 0xffff ; GFX6-NEXT: s_or_b32 s9, s9, s10 -; GFX6-NEXT: s_bfe_u32 s10, s4, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_lshr_b32 s10, s10, 14 -; GFX6-NEXT: s_or_b32 s0, s0, s10 -; GFX6-NEXT: s_bfe_u32 s10, s5, 0xf0001 -; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_lshr_b32 s10, s10, 14 +; GFX6-NEXT: s_and_b32 s10, 0xffff, 1 +; GFX6-NEXT: s_bfe_u32 s11, s4, 0xf0001 +; GFX6-NEXT: s_and_b32 s12, 0xffff, 14 +; GFX6-NEXT: s_lshl_b32 s0, s0, s10 +; GFX6-NEXT: s_lshr_b32 s11, s11, s12 +; GFX6-NEXT: s_or_b32 s0, s0, s11 +; GFX6-NEXT: s_bfe_u32 s11, s5, 0xf0001 +; GFX6-NEXT: s_lshl_b32 s1, s1, s10 +; GFX6-NEXT: s_lshr_b32 s11, s11, s12 ; GFX6-NEXT: s_xor_b32 s8, s8, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s10 +; GFX6-NEXT: s_or_b32 s1, s1, s11 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_lshr_b32 s10, s8, 16 -; GFX6-NEXT: s_and_b32 s11, s8, 15 +; GFX6-NEXT: s_lshr_b32 s11, s8, 16 +; GFX6-NEXT: s_and_b32 s13, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_and_b32 s11, 0xffff, s11 +; GFX6-NEXT: s_and_b32 s13, 0xffff, s13 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 ; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX6-NEXT: s_lshl_b32 s0, s0, s11 +; GFX6-NEXT: s_lshl_b32 s0, s0, s13 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s4, s10, 15 +; GFX6-NEXT: s_and_b32 s4, s11, 15 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: 
s_andn2_b32 s8, 15, s10 +; GFX6-NEXT: s_andn2_b32 s8, 15, s11 ; GFX6-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 @@ -4760,13 +4855,13 @@ ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_lshl_b32 s1, s2, 1 +; GFX6-NEXT: s_lshl_b32 s1, s2, s10 ; GFX6-NEXT: s_bfe_u32 s2, s6, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s2, s2, 14 +; GFX6-NEXT: s_lshr_b32 s2, s2, s12 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_lshl_b32 s2, s3, 1 +; GFX6-NEXT: s_lshl_b32 s2, s3, s10 ; GFX6-NEXT: s_bfe_u32 s3, s7, 0xf0001 -; GFX6-NEXT: s_lshr_b32 s3, s3, 14 +; GFX6-NEXT: s_lshr_b32 s3, s3, s12 ; GFX6-NEXT: s_xor_b32 s5, s9, -1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s6, 1 @@ -4796,34 +4891,36 @@ ; ; GFX8-LABEL: s_fshr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s8, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s8, 0xffff, 1 +; GFX8-NEXT: s_and_b32 s9, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s10, 0xffff, 15 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_lshr_b32 s8, s8, 15 -; GFX8-NEXT: s_or_b32 s0, s0, s8 -; GFX8-NEXT: s_lshl_b32 s6, s6, 1 -; GFX8-NEXT: s_lshr_b32 s8, s7, 15 -; GFX8-NEXT: s_lshl_b32 s2, s2, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_lshr_b32 s9, s9, s10 +; GFX8-NEXT: s_or_b32 s0, s0, s9 +; GFX8-NEXT: s_lshl_b32 s6, s6, s8 +; GFX8-NEXT: s_lshr_b32 s9, s7, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, s8 ; GFX8-NEXT: s_xor_b32 s4, s4, -1 -; GFX8-NEXT: s_or_b32 s6, s6, s8 -; GFX8-NEXT: s_lshr_b32 s8, s4, 16 -; GFX8-NEXT: s_and_b32 s9, s4, 15 +; GFX8-NEXT: s_or_b32 s6, s6, s9 +; GFX8-NEXT: s_lshr_b32 s9, s4, 16 +; GFX8-NEXT: s_and_b32 s11, s4, 15 ; GFX8-NEXT: s_andn2_b32 s4, 15, s4 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_and_b32 s9, 0xffff, s9 -; GFX8-NEXT: s_lshr_b32 s2, s2, 1 +; GFX8-NEXT: s_and_b32 s11, 0xffff, s11 +; GFX8-NEXT: s_lshr_b32 s2, s2, s8 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s9 +; GFX8-NEXT: s_lshl_b32 s0, s0, s11 ; GFX8-NEXT: s_lshr_b32 s2, s2, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s8, 15 -; GFX8-NEXT: s_lshl_b32 s7, s7, 1 +; GFX8-NEXT: s_and_b32 s2, s9, 15 +; GFX8-NEXT: s_lshl_b32 s7, s7, s8 ; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_andn2_b32 s4, 15, s8 +; GFX8-NEXT: s_andn2_b32 s4, 15, s9 ; GFX8-NEXT: s_lshl_b32 s2, s6, s2 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s7 -; GFX8-NEXT: s_lshr_b32 s6, s6, 1 +; GFX8-NEXT: s_lshr_b32 s6, s6, s8 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshr_b32 s4, s6, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s4 @@ -4834,12 +4931,12 @@ ; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: s_lshr_b32 s2, s1, 16 ; GFX8-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NEXT: s_lshl_b32 s1, s1, 1 -; GFX8-NEXT: s_lshr_b32 s6, s6, 15 +; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_lshr_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_lshl_b32 s2, s2, 1 -; GFX8-NEXT: s_lshr_b32 s6, s4, 15 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_lshr_b32 s6, s4, s10 +; GFX8-NEXT: s_lshl_b32 s3, s3, s8 ; GFX8-NEXT: s_xor_b32 s5, s5, -1 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshr_b32 s6, s5, 16 @@ -4847,18 +4944,18 @@ ; GFX8-NEXT: s_andn2_b32 s5, 15, s5 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s3, s3, s8 ; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; 
GFX8-NEXT: s_lshl_b32 s1, s1, s7 ; GFX8-NEXT: s_lshr_b32 s3, s3, s5 ; GFX8-NEXT: s_or_b32 s1, s1, s3 ; GFX8-NEXT: s_and_b32 s3, s6, 15 -; GFX8-NEXT: s_lshl_b32 s4, s4, 1 +; GFX8-NEXT: s_lshl_b32 s4, s4, s8 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_andn2_b32 s5, 15, s6 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s3, s3, s8 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s3 @@ -5002,13 +5099,15 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v11 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX6-NEXT: s_and_b32 s4, 0xffff, 1 ; GFX6-NEXT: v_bfe_u32 v10, v4, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10 +; GFX6-NEXT: s_and_b32 s5, 0xffff, 14 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, s5, v10 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 ; GFX6-NEXT: v_bfe_u32 v10, v5, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 14, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, s5, v10 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8 @@ -5033,12 +5132,12 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, s5, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 14, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, s5, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v9 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -207,12 +207,13 @@ ; ; GFX8-LABEL: abs_sgpr_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_ashr_i32 s0, s0, 8 -; GFX8-NEXT: s_ashr_i32 s1, s1, 8 +; GFX8-NEXT: s_ashr_i32 s0, s0, s2 +; GFX8-NEXT: s_ashr_i32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_abs_i32 s0, s0 @@ -287,15 +288,16 @@ ; ; GFX8-LABEL: abs_sgpr_v3i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_and_b32 s3, 0xffff, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_ashr_i32 s0, s0, 8 -; GFX8-NEXT: s_ashr_i32 s1, s1, 8 -; GFX8-NEXT: s_ashr_i32 s2, s2, 8 +; GFX8-NEXT: s_ashr_i32 s0, s0, s3 +; GFX8-NEXT: s_ashr_i32 s1, s1, s3 +; GFX8-NEXT: s_ashr_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -723,23 +723,36 @@ ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 -; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s0 ; GFX1030-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030-NEXT: v_mov_b32_e32 v2, s2 ; GFX1030-NEXT: v_mov_b32_e32 v3, s3 +; GFX1030-NEXT: s_mov_b32 s0, 0xffff ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 +; GFX1030-NEXT: s_and_b32 s1, s0, 0x4400 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] +; GFX1030-NEXT: s_and_b32 s2, s0, 0x4200 +; GFX1030-NEXT: s_lshl_b32 s1, s1, 16 +; GFX1030-NEXT: s_and_b32 s8, s0, 0x4800 +; GFX1030-NEXT: s_or_b32 s1, s2, s1 +; GFX1030-NEXT: s_and_b32 s2, s0, 0x4600 +; GFX1030-NEXT: s_and_b32 s3, s0, 0x4500 +; GFX1030-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1030-NEXT: s_and_b32 s0, s0, 0x4700 +; GFX1030-NEXT: s_lshl_b32 s8, s8, 16 +; GFX1030-NEXT: s_or_b32 s2, s3, s2 +; GFX1030-NEXT: s_or_b32 s0, s0, s8 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 +; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 +; GFX1030-NEXT: v_mov_b32_e32 v5, s1 +; GFX1030-NEXT: v_mov_b32_e32 v6, s2 +; GFX1030-NEXT: v_mov_b32_e32 v7, s0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -750,23 +763,36 @@ ; GFX1013: ; %bb.0: ; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, s0 ; GFX1013-NEXT: v_mov_b32_e32 v1, s1 ; GFX1013-NEXT: v_mov_b32_e32 v2, s2 ; GFX1013-NEXT: v_mov_b32_e32 v3, s3 +; GFX1013-NEXT: s_mov_b32 s0, 0xffff ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 +; GFX1013-NEXT: s_and_b32 s1, s0, 0x4400 ; GFX1013-NEXT: flat_load_dword v0, v[4:5] ; GFX1013-NEXT: flat_load_dword v1, v[2:3] +; GFX1013-NEXT: s_and_b32 s2, s0, 0x4200 +; GFX1013-NEXT: s_lshl_b32 s1, s1, 16 +; GFX1013-NEXT: s_and_b32 s8, s0, 0x4800 +; GFX1013-NEXT: s_or_b32 s1, s2, s1 +; GFX1013-NEXT: s_and_b32 s2, s0, 0x4600 +; GFX1013-NEXT: s_and_b32 s3, s0, 0x4500 +; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1013-NEXT: s_and_b32 s0, s0, 0x4700 +; GFX1013-NEXT: s_lshl_b32 s8, s8, 16 +; GFX1013-NEXT: s_or_b32 s2, s3, s2 +; GFX1013-NEXT: s_or_b32 s0, s0, s8 ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 +; GFX1013-NEXT: v_mov_b32_e32 v5, s1 +; GFX1013-NEXT: v_mov_b32_e32 v6, s2 +; GFX1013-NEXT: v_mov_b32_e32 v7, s0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: 
image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) @@ -942,17 +968,30 @@ ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: s_mov_b32 s4, 0xffff +; GFX1030-NEXT: s_and_b32 s5, s4, 0x4400 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX1030-NEXT: s_and_b32 s6, s4, 0x4200 +; GFX1030-NEXT: s_lshl_b32 s5, s5, 16 +; GFX1030-NEXT: s_and_b32 s8, s4, 0x4800 ; GFX1030-NEXT: flat_load_dword v2, v[0:1] +; GFX1030-NEXT: s_or_b32 s5, s6, s5 +; GFX1030-NEXT: s_and_b32 s6, s4, 0x4600 +; GFX1030-NEXT: s_and_b32 s7, s4, 0x4500 +; GFX1030-NEXT: s_lshl_b32 s6, s6, 16 +; GFX1030-NEXT: s_and_b32 s4, s4, 0x4700 +; GFX1030-NEXT: s_lshl_b32 s8, s8, 16 +; GFX1030-NEXT: s_or_b32 s6, s7, s6 +; GFX1030-NEXT: s_or_b32 s4, s4, s8 ; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102 +; GFX1030-NEXT: v_mov_b32_e32 v6, s5 +; GFX1030-NEXT: v_mov_b32_e32 v7, s6 +; GFX1030-NEXT: v_mov_b32_e32 v8, s4 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -965,20 +1004,33 @@ ; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX1013-NEXT: s_mov_b32 s0, 0xffff ; GFX1013-NEXT: v_mov_b32_e32 v3, 0 +; GFX1013-NEXT: s_and_b32 s1, s0, 0x4400 +; GFX1013-NEXT: s_and_b32 s8, s0, 0x4800 +; GFX1013-NEXT: s_lshl_b32 s1, s1, 16 +; GFX1013-NEXT: s_lshl_b32 s8, s8, 16 ; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200 -; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500 -; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) ; GFX1013-NEXT: v_mov_b32_e32 v0, s2 ; GFX1013-NEXT: v_mov_b32_e32 v1, s3 +; GFX1013-NEXT: s_and_b32 s2, s0, 0x4200 +; GFX1013-NEXT: s_and_b32 s3, s0, 0x4500 +; GFX1013-NEXT: s_or_b32 s1, s2, s1 ; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX1013-NEXT: s_and_b32 s2, s0, 0x4600 +; GFX1013-NEXT: s_and_b32 s0, s0, 0x4700 +; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 ; GFX1013-NEXT: flat_load_dword v2, v[0:1] +; GFX1013-NEXT: s_or_b32 s2, s3, s2 +; GFX1013-NEXT: s_or_b32 s0, s0, s8 ; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6 ; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102 +; GFX1013-NEXT: v_mov_b32_e32 v6, s1 +; GFX1013-NEXT: v_mov_b32_e32 v7, s2 +; GFX1013-NEXT: v_mov_b32_e32 v8, s0 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -662,6 +662,13 @@ ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_mov_b32 s2, 0xffff +; SI-NEXT: s_and_b32 s3, 0xffff, 0 +; SI-NEXT: s_and_b32 s2, s2, 0x3c00 +; SI-NEXT: 
s_lshl_b32 s4, s3, 16 +; SI-NEXT: s_or_b32 s4, s2, s4 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_or_b32 s5, s3, s2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -670,8 +677,8 @@ ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 -; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[4:5] +; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] ; SI-NEXT: .LBB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 s[2:3], s[0:1] @@ -686,8 +693,8 @@ ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[2:3], s[0:1], vcc ; SI-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; SI-NEXT: s_xor_b64 s[2:3], exec, s[6:7] ; SI-NEXT: s_cbranch_execz .LBB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -696,8 +703,8 @@ ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: .LBB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 -; SI-NEXT: v_bfrev_b32_e32 v1, 60 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB6_7: @@ -885,7 +892,12 @@ ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s2, 0xffff +; SI-NEXT: s_and_b32 s4, s2, 0x3c00 +; SI-NEXT: s_and_b32 s5, 0xffff, 0 +; SI-NEXT: s_lshl_b32 s6, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s4, 16 +; SI-NEXT: s_mov_b32 s8, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -894,12 +906,14 @@ ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 -; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_wqm_b64 s[10:11], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[10:11] ; SI-NEXT: .LBB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_or_b32 s6, s4, s6 +; SI-NEXT: s_or_b32 s7, s5, s7 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_branch .LBB7_5 ; SI-NEXT: .LBB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -922,8 +936,8 @@ ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9] ; SI-NEXT: s_cbranch_execz .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -931,14 +945,14 @@ ; SI-NEXT: s_cbranch_scc0 .LBB7_9 ; SI-NEXT: ; %bb.7: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[8:9] ; SI-NEXT: s_branch .LBB7_4 ; SI-NEXT: .LBB7_8: ; %.return ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_and_b64 exec, exec, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 -; SI-NEXT: v_bfrev_b32_e32 v1, 60 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: exp mrt0 v0, v0, 
v1, v1 done compr vm ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB7_9: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -74,29 +74,31 @@ ; ; GFX8-LABEL: s_saddsat_i7: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 9 -; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_sext_i32_i16 s3, 0 -; GFX8-NEXT: s_max_i32 s4, s2, s3 -; GFX8-NEXT: s_min_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s1, 9 -; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s3, s0 +; GFX8-NEXT: s_sext_i32_i16 s4, 0 +; GFX8-NEXT: s_max_i32 s5, s3, s4 +; GFX8-NEXT: s_min_i32 s3, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 -; GFX8-NEXT: s_max_i32 s1, s2, s1 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 +; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_min_i32 s1, s1, s2 +; GFX8-NEXT: s_sext_i32_i16 s3, s5 +; GFX8-NEXT: s_min_i32 s1, s1, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 9 +; GFX8-NEXT: s_ashr_i32 s0, s0, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_saddsat_i7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s1, s1, 9 -; GFX9-NEXT: s_lshl_b32 s0, s0, 9 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 @@ -105,8 +107,9 @@ ; ; GFX10PLUS-LABEL: s_saddsat_i7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 +; GFX10PLUS-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -184,29 +187,31 @@ ; ; GFX8-LABEL: s_saddsat_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 -; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_sext_i32_i16 s3, 0 -; GFX8-NEXT: s_max_i32 s4, s2, s3 -; GFX8-NEXT: s_min_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s3, s0 +; GFX8-NEXT: s_sext_i32_i16 s4, 0 +; GFX8-NEXT: s_max_i32 s5, s3, s4 +; GFX8-NEXT: s_min_i32 s3, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 -; GFX8-NEXT: s_max_i32 s1, s2, s1 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 +; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_min_i32 s1, s1, s2 +; GFX8-NEXT: s_sext_i32_i16 s3, s5 +; GFX8-NEXT: s_min_i32 s1, s1, s3 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 8 +; GFX8-NEXT: s_ashr_i32 s0, s0, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_saddsat_i8: ; GFX9: ; %bb.0: 
-; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 @@ -215,8 +220,9 @@ ; ; GFX10PLUS-LABEL: s_saddsat_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10PLUS-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -382,44 +388,45 @@ ; ; GFX8-LABEL: s_saddsat_v2i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s4, 0xffff, 8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s5, 0 -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s0 +; GFX8-NEXT: s_sext_i32_i16 s6, 0 +; GFX8-NEXT: s_max_i32 s7, s5, s6 +; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 -; GFX8-NEXT: s_max_i32 s1, s4, s1 +; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7 +; GFX8-NEXT: s_max_i32 s1, s5, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s4, s6 -; GFX8-NEXT: s_min_i32 s1, s1, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s7 +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, 8 -; GFX8-NEXT: s_lshl_b32 s2, s3, 8 +; GFX8-NEXT: s_lshl_b32 s1, s2, s4 +; GFX8-NEXT: s_lshl_b32 s2, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s4, s3, s5 -; GFX8-NEXT: s_min_i32 s3, s3, s5 +; GFX8-NEXT: s_max_i32 s5, s3, s6 +; GFX8-NEXT: s_min_i32 s3, s3, s6 ; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s2, s3, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s3, s4 +; GFX8-NEXT: s_sext_i32_i16 s3, s5 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s1, s1, 8 -; GFX8-NEXT: s_ashr_i32 s0, s0, 8 +; GFX8-NEXT: s_ashr_i32 s1, s1, s4 +; GFX8-NEXT: s_ashr_i32 s0, s0, s4 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -782,46 +789,47 @@ ; ; GFX8-LABEL: s_saddsat_v4i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s8, 0xffff, 8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 -; GFX8-NEXT: s_sext_i32_i16 s8, s0 -; GFX8-NEXT: s_sext_i32_i16 s9, 0 -; GFX8-NEXT: s_max_i32 s10, s8, s9 -; GFX8-NEXT: s_min_i32 s8, s8, s9 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_sext_i32_i16 s9, s0 +; GFX8-NEXT: s_sext_i32_i16 s10, 0 +; GFX8-NEXT: s_max_i32 s11, s9, s10 +; 
GFX8-NEXT: s_min_i32 s9, s9, s10 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8 -; GFX8-NEXT: s_sext_i32_i16 s8, s8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_sub_i32 s9, 0xffff8000, s9 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10 -; GFX8-NEXT: s_max_i32 s1, s8, s1 +; GFX8-NEXT: s_sub_i32 s11, 0x7fff, s11 +; GFX8-NEXT: s_max_i32 s1, s9, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s8, s10 -; GFX8-NEXT: s_min_i32 s1, s1, s8 +; GFX8-NEXT: s_sext_i32_i16 s9, s11 +; GFX8-NEXT: s_min_i32 s1, s1, s9 ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, 8 -; GFX8-NEXT: s_lshl_b32 s2, s5, 8 +; GFX8-NEXT: s_lshl_b32 s1, s2, s8 +; GFX8-NEXT: s_lshl_b32 s2, s5, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_max_i32 s8, s5, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s9 +; GFX8-NEXT: s_max_i32 s9, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8 +; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s5, s8 +; GFX8-NEXT: s_sext_i32_i16 s5, s9 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s2, s3, 8 +; GFX8-NEXT: s_lshl_b32 s2, s3, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_lshl_b32 s3, s6, 8 -; GFX8-NEXT: s_max_i32 s6, s5, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s9 +; GFX8-NEXT: s_lshl_b32 s3, s6, s8 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -831,11 +839,11 @@ ; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_min_i32 s3, s3, s5 ; GFX8-NEXT: s_add_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s3, s4, 8 +; GFX8-NEXT: s_lshl_b32 s3, s4, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s3 -; GFX8-NEXT: s_max_i32 s6, s5, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s9 -; GFX8-NEXT: s_lshl_b32 s4, s7, 8 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_lshl_b32 s4, s7, s8 ; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -843,21 +851,21 @@ ; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_max_i32 s4, s5, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s1, s1, 8 +; GFX8-NEXT: s_ashr_i32 s1, s1, s8 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s6 -; GFX8-NEXT: s_ashr_i32 s0, s0, 8 +; GFX8-NEXT: s_ashr_i32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_ashr_i32 s2, s2, 8 +; GFX8-NEXT: s_ashr_i32 s2, s2, s8 ; GFX8-NEXT: s_add_i32 s3, s3, s4 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NEXT: s_ashr_i32 s3, s3, 8 +; GFX8-NEXT: s_ashr_i32 s3, s3, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff Index: llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ 
llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -47,21 +47,24 @@ ; ; GFX8-LABEL: s_sext_inreg_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_and_b32 s1, 0xffff, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i8 s0, s0 ; GFX8-NEXT: s_ashr_i32 s0, s0, 3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 3 +; GFX9-NEXT: s_and_b32 s1, 0xffff, 3 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: s_sext_i32_i8 s0, s0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, 3 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 3 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -78,21 +81,24 @@ ; ; GFX8-LABEL: s_sext_inreg_i8_6: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 6 +; GFX8-NEXT: s_and_b32 s1, 0xffff, 6 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i8 s0, s0 ; GFX8-NEXT: s_ashr_i32 s0, s0, 6 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_i8_6: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 6 +; GFX9-NEXT: s_and_b32 s1, 0xffff, 6 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: s_sext_i32_i8 s0, s0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 6 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i8_6: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 6 +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 6 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -578,21 +584,24 @@ ; ; GFX8-LABEL: s_sext_inreg_i16_9: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 9 +; GFX8-NEXT: s_and_b32 s1, 0xffff, 9 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s0, s0, 9 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_i16_9: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 9 +; GFX9-NEXT: s_and_b32 s1, 0xffff, 9 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 9 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i16_9: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, 9 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9 ; GFX10PLUS-NEXT: ; return to shader part epilog @@ -609,21 +618,24 @@ ; ; GFX8-LABEL: s_sext_inreg_i16_15: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 15 +; GFX8-NEXT: s_and_b32 s1, 0xffff, 15 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s0, s0, 15 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_sext_inreg_i16_15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s0, s0, 15 +; GFX9-NEXT: s_and_b32 s1, 0xffff, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 15 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_sext_inreg_i16_15: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 15 +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, 15 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 ; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 15 ; 
GFX10PLUS-NEXT: ; return to shader part epilog @@ -720,8 +732,9 @@ ; GFX8-LABEL: s_sext_inreg_v2i16_11: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 11 -; GFX8-NEXT: s_lshl_b32 s1, s1, 11 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 11 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_ashr_i32 s0, s0, 11 @@ -854,11 +867,12 @@ ; GFX8-LABEL: s_sext_inreg_v4i16_14: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_and_b32 s4, 0xffff, 14 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 14 -; GFX8-NEXT: s_lshl_b32 s2, s2, 14 -; GFX8-NEXT: s_lshl_b32 s1, s1, 14 -; GFX8-NEXT: s_lshl_b32 s3, s3, 14 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_lshl_b32 s2, s2, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_lshl_b32 s3, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 @@ -1069,23 +1083,24 @@ ; GFX8-LABEL: s_sext_inreg_v8i16_5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_and_b32 s8, 0xffff, 5 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshl_b32 s0, s0, 5 -; GFX8-NEXT: s_lshl_b32 s4, s4, 5 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_lshl_b32 s4, s4, s8 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_lshl_b32 s1, s1, 5 -; GFX8-NEXT: s_lshl_b32 s5, s5, 5 +; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_lshl_b32 s5, s5, s8 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_lshl_b32 s2, s2, 5 -; GFX8-NEXT: s_lshl_b32 s6, s6, 5 +; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_lshl_b32 s6, s6, s8 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_ashr_i32 s0, s0, 5 ; GFX8-NEXT: s_ashr_i32 s4, s4, 5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 5 -; GFX8-NEXT: s_lshl_b32 s7, s7, 5 +; GFX8-NEXT: s_lshl_b32 s3, s3, s8 +; GFX8-NEXT: s_lshl_b32 s7, s7, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_ashr_i32 s1, s1, 5 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -91,14 +91,27 @@ } define amdgpu_ps i8 @s_shl_i8_7(i8 inreg %value) { -; GCN-LABEL: s_shl_i8_7: -; GCN: ; %bb.0: -; GCN-NEXT: s_lshl_b32 s0, s0, 7 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_shl_i8_7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b32 s0, s0, 7 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_shl_i8_7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, 0xffff, 7 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_shl_i8_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, 0xffff, 7 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i8_7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 7 +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, 7 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i8 %value, 7 ret i8 %result @@ -647,14 +660,27 @@ } define amdgpu_ps i16 @s_shl_i16_15(i16 inreg %value) { -; GCN-LABEL: s_shl_i16_15: -; GCN: ; %bb.0: -; GCN-NEXT: s_lshl_b32 s0, s0, 15 -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_shl_i16_15: +; GFX6: ; %bb.0: +; 
GFX6-NEXT: s_lshl_b32 s0, s0, 15 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_shl_i16_15: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, 0xffff, 15 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_shl_i16_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, 0xffff, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i16_15: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 15 +; GFX10PLUS-NEXT: s_and_b32 s1, 0xffff, 15 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i16 %value, 15 ret i16 %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -74,29 +74,31 @@ ; ; GFX8-LABEL: s_ssubsat_i7: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 9 -; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_sext_i32_i16 s3, -1 -; GFX8-NEXT: s_max_i32 s4, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s1, 9 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff -; GFX8-NEXT: s_min_i32 s2, s2, s3 -; GFX8-NEXT: s_sext_i32_i16 s3, s4 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s3, s0 +; GFX8-NEXT: s_sext_i32_i16 s4, -1 +; GFX8-NEXT: s_max_i32 s5, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s4 +; GFX8-NEXT: s_sext_i32_i16 s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 -; GFX8-NEXT: s_max_i32 s1, s3, s1 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s4, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_min_i32 s1, s1, s2 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_min_i32 s1, s1, s3 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 9 +; GFX8-NEXT: s_ashr_i32 s0, s0, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ssubsat_i7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s1, s1, 9 -; GFX9-NEXT: s_lshl_b32 s0, s0, 9 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 @@ -105,8 +107,9 @@ ; ; GFX10PLUS-LABEL: s_ssubsat_i7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 +; GFX10PLUS-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -184,29 +187,31 @@ ; ; GFX8-LABEL: s_ssubsat_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 -; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_sext_i32_i16 s3, -1 -; GFX8-NEXT: s_max_i32 s4, s2, s3 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff -; GFX8-NEXT: s_min_i32 s2, s2, s3 -; GFX8-NEXT: s_sext_i32_i16 s3, s4 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s3, s0 +; GFX8-NEXT: s_sext_i32_i16 s4, -1 +; GFX8-NEXT: s_max_i32 s5, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, 
s4 +; GFX8-NEXT: s_sext_i32_i16 s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 -; GFX8-NEXT: s_max_i32 s1, s3, s1 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s4, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_min_i32 s1, s1, s2 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_min_i32 s1, s1, s3 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s0, s0, 8 +; GFX8-NEXT: s_ashr_i32 s0, s0, s2 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_ssubsat_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 @@ -215,8 +220,9 @@ ; ; GFX10PLUS-LABEL: s_ssubsat_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10PLUS-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -382,44 +388,45 @@ ; ; GFX8-LABEL: s_ssubsat_v2i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s4, 0xffff, 8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s5, -1 -; GFX8-NEXT: s_max_i32 s6, s4, s5 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s0 +; GFX8-NEXT: s_sext_i32_i16 s6, -1 +; GFX8-NEXT: s_max_i32 s7, s5, s6 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff -; GFX8-NEXT: s_min_i32 s4, s4, s5 -; GFX8-NEXT: s_sext_i32_i16 s6, s6 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s6 +; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 -; GFX8-NEXT: s_max_i32 s1, s6, s1 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s7, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_min_i32 s1, s1, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, 8 -; GFX8-NEXT: s_lshl_b32 s2, s3, 8 +; GFX8-NEXT: s_lshl_b32 s1, s2, s4 +; GFX8-NEXT: s_lshl_b32 s2, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s4, s3, s5 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff -; GFX8-NEXT: s_min_i32 s3, s3, s5 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_max_i32 s5, s3, s6 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s6 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 -; GFX8-NEXT: s_max_i32 s2, s4, s2 +; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s1, s1, 8 -; GFX8-NEXT: s_ashr_i32 s0, s0, 8 +; GFX8-NEXT: s_ashr_i32 s1, s1, s4 +; GFX8-NEXT: s_ashr_i32 s0, s0, s4 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff -; GFX8-NEXT: 
s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -782,47 +789,48 @@ ; ; GFX8-LABEL: s_ssubsat_v4i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s8, 0xffff, 8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 -; GFX8-NEXT: s_sext_i32_i16 s8, s0 -; GFX8-NEXT: s_sext_i32_i16 s9, -1 -; GFX8-NEXT: s_max_i32 s10, s8, s9 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_sext_i32_i16 s9, s0 +; GFX8-NEXT: s_sext_i32_i16 s10, -1 +; GFX8-NEXT: s_max_i32 s11, s9, s10 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff -; GFX8-NEXT: s_min_i32 s8, s8, s9 -; GFX8-NEXT: s_sext_i32_i16 s10, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_sub_i32 s11, s11, 0x7fff +; GFX8-NEXT: s_min_i32 s9, s9, s10 +; GFX8-NEXT: s_sext_i32_i16 s11, s11 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 -; GFX8-NEXT: s_max_i32 s1, s10, s1 +; GFX8-NEXT: s_sub_i32 s9, s9, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s11, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s8, s8 -; GFX8-NEXT: s_min_i32 s1, s1, s8 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 +; GFX8-NEXT: s_min_i32 s1, s1, s9 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, 8 -; GFX8-NEXT: s_lshl_b32 s2, s5, 8 +; GFX8-NEXT: s_lshl_b32 s1, s2, s8 +; GFX8-NEXT: s_lshl_b32 s2, s5, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_max_i32 s8, s5, s9 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff -; GFX8-NEXT: s_min_i32 s5, s5, s9 -; GFX8-NEXT: s_sext_i32_i16 s8, s8 +; GFX8-NEXT: s_max_i32 s9, s5, s10 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 -; GFX8-NEXT: s_max_i32 s2, s8, s2 +; GFX8-NEXT: s_max_i32 s2, s9, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 -; GFX8-NEXT: s_lshl_b32 s2, s3, 8 +; GFX8-NEXT: s_lshl_b32 s2, s3, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s2 -; GFX8-NEXT: s_lshl_b32 s3, s6, 8 -; GFX8-NEXT: s_max_i32 s6, s5, s9 +; GFX8-NEXT: s_lshl_b32 s3, s6, s8 +; GFX8-NEXT: s_max_i32 s6, s5, s10 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff -; GFX8-NEXT: s_min_i32 s5, s5, s9 +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 @@ -831,33 +839,33 @@ ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s3, s3, s5 ; GFX8-NEXT: s_sub_i32 s2, s2, s3 -; GFX8-NEXT: s_lshl_b32 s3, s4, 8 +; GFX8-NEXT: s_lshl_b32 s3, s4, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s3 -; GFX8-NEXT: s_max_i32 s6, s5, s9 -; GFX8-NEXT: s_lshl_b32 s4, s7, 8 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_lshl_b32 s4, s7, s8 ; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff -; GFX8-NEXT: s_min_i32 s5, s5, s9 +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 -; GFX8-NEXT: s_ashr_i32 s1, s1, 8 +; GFX8-NEXT: s_ashr_i32 s1, s1, s8 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_ashr_i32 s0, s0, 8 +; 
GFX8-NEXT: s_ashr_i32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_ashr_i32 s2, s2, 8 +; GFX8-NEXT: s_ashr_i32 s2, s2, s8 ; GFX8-NEXT: s_sub_i32 s3, s3, s4 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NEXT: s_ashr_i32 s3, s3, 8 +; GFX8-NEXT: s_ashr_i32 s3, s3, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -69,55 +69,56 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_and_b32 s0, 0xffff, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s3, s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: s_lshr_b32 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 s0, s5, 16 +; GFX9-NEXT: s_lshr_b32 s1, s5, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 +; GFX9-NEXT: s_lshr_b32 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s6 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s6 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshr_b32 s0, s6, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX9-NEXT: s_lshr_b32 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s7 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s7 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s0, s7, 16 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:12 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 
s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:13 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 +; GFX9-NEXT: s_lshr_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:15 ; GFX9-NEXT: s_endpgm ; @@ -179,50 +180,51 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, 8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s4, 16 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s4 +; GFX10-NEXT: s_lshr_b32 s1, s4, 16 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_lshr_b32 s2, s5, 16 -; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s1, s1, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: s_lshr_b32 s4, s6, 16 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 +; GFX10-NEXT: s_lshr_b32 s3, s3, s0 +; GFX10-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10-NEXT: s_and_b32 s8, 0xffff, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: s_lshr_b32 s0, s3, 8 -; GFX10-NEXT: s_lshr_b32 s3, s2, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: s_lshr_b32 s1, s4, s0 +; GFX10-NEXT: s_lshr_b32 s4, s2, s0 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 ; GFX10-NEXT: v_mov_b32_e32 v7, s6 -; GFX10-NEXT: v_mov_b32_e32 v8, s0 -; GFX10-NEXT: v_mov_b32_e32 v9, s3 +; GFX10-NEXT: v_mov_b32_e32 v5, s2 +; GFX10-NEXT: s_lshr_b32 s2, s8, s0 +; GFX10-NEXT: v_mov_b32_e32 v8, s1 +; GFX10-NEXT: v_mov_b32_e32 v9, s4 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 -; GFX10-NEXT: ds_write_b8 v1, v5 offset:6 ; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: ds_write_b8 v1, v5 offset:6 +; GFX10-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-NEXT: v_mov_b32_e32 v10, s2 -; GFX10-NEXT: s_lshr_b32 s0, s4, 8 +; GFX10-NEXT: s_lshr_b32 s1, s5, s0 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 ; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s7 -; GFX10-NEXT: s_lshr_b32 s1, s7, 16 -; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s7 +; GFX10-NEXT: s_lshr_b32 s2, s7, 16 +; GFX10-NEXT: s_lshr_b32 s1, s1, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s7 -; GFX10-NEXT: v_mov_b32_e32 v3, s0 -; GFX10-NEXT: s_lshr_b32 s0, s1, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: s_lshr_b32 s0, s2, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:12 @@ -236,46 +238,48 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_and_b32 s1, 0xffff, 8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX11-NEXT: s_lshr_b32 s1, s4, 16 -; 
GFX11-NEXT: s_lshr_b32 s2, s2, 8 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s4 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: s_lshr_b32 s0, s5, 16 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: s_lshr_b32 s4, s6, 16 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s6 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: s_lshr_b32 s1, s3, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s0, s5, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v9, s3 +; GFX11-NEXT: s_lshr_b32 s3, s3, s1 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s5 +; GFX11-NEXT: s_and_b32 s8, 0xffff, s6 +; GFX11-NEXT: s_lshr_b32 s9, s2, s1 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: s_lshr_b32 s5, s6, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, s1 +; GFX11-NEXT: s_lshr_b32 s4, s0, s1 +; GFX11-NEXT: s_lshr_b32 s0, s8, s1 +; GFX11-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: v_mov_b32_e32 v8, s4 ; GFX11-NEXT: ds_store_b8 v1, v0 -; GFX11-NEXT: ds_store_b8 v1, v6 offset:1 -; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 -; GFX11-NEXT: ds_store_b8 v1, v7 offset:3 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:1 +; GFX11-NEXT: ds_store_b8 v1, v3 offset:2 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:3 ; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 -; GFX11-NEXT: ds_store_b8 v1, v8 offset:5 -; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 -; GFX11-NEXT: ds_store_b8 v1, v9 offset:7 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7 -; GFX11-NEXT: s_lshr_b32 s0, s4, 8 -; GFX11-NEXT: s_lshr_b32 s1, s7, 16 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:5 +; GFX11-NEXT: ds_store_b8 v1, v4 offset:6 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:7 +; GFX11-NEXT: v_mov_b32_e32 v3, s5 +; GFX11-NEXT: s_lshr_b32 s2, s7, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: s_lshr_b32 s0, s5, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v5, s7 ; GFX11-NEXT: v_mov_b32_e32 v4, s0 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s7 -; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v7, s1 -; GFX11-NEXT: s_lshr_b32 s0, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v6, s0 -; GFX11-NEXT: s_lshr_b32 s0, s1, 8 +; GFX11-NEXT: s_lshr_b32 s0, s2, s1 ; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 -; GFX11-NEXT: ds_store_b8 v1, v0 offset:9 -; GFX11-NEXT: ds_store_b8 v1, v2 offset:10 +; GFX11-NEXT: ds_store_b8 v1, v0 offset:8 +; GFX11-NEXT: ds_store_b8 v1, v2 offset:9 +; GFX11-NEXT: ds_store_b8 v1, v3 offset:10 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:11 ; GFX11-NEXT: ds_store_b8 v1, v5 offset:12 ; GFX11-NEXT: ds_store_b8 v1, v6 offset:13 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -65,43 +65,44 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; 
GFX9-NEXT: s_and_b32 s0, 0xffff, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_lshr_b32 s3, s3, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: s_lshr_b32 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s5 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s5 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: s_lshr_b32 s0, s5, 16 +; GFX9-NEXT: s_lshr_b32 s1, s5, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 +; GFX9-NEXT: s_lshr_b32 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s6 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s6 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: s_lshr_b32 s0, s6, 16 +; GFX9-NEXT: s_lshr_b32 s1, s6, 16 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX9-NEXT: s_lshr_b32 s1, s0, 8 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX9-NEXT: s_lshr_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX9-NEXT: s_endpgm ; @@ -152,30 +153,29 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-NEXT: s_and_b32 s0, 0xffff, 8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s0, s4, 16 +; GFX10-NEXT: s_lshr_b32 s1, s4, 16 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: s_lshr_b32 s2, s5, 16 -; GFX10-NEXT: s_and_b32 s3, 0xffff, s5 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_lshr_b32 s4, s6, 16 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s6 +; GFX10-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10-NEXT: s_and_b32 s7, 0xffff, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s6 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v4, s0 -; GFX10-NEXT: s_lshr_b32 s0, s3, 8 -; GFX10-NEXT: s_lshr_b32 s3, s2, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-NEXT: s_lshr_b32 s1, s4, s0 +; GFX10-NEXT: s_lshr_b32 s4, s2, s0 +; GFX10-NEXT: s_lshr_b32 s3, s3, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 -; GFX10-NEXT: v_mov_b32_e32 v9, s3 -; GFX10-NEXT: 
v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v8, s0 -; GFX10-NEXT: v_mov_b32_e32 v10, s2 -; GFX10-NEXT: s_lshr_b32 s0, s4, 8 +; GFX10-NEXT: s_lshr_b32 s2, s7, s0 +; GFX10-NEXT: v_mov_b32_e32 v9, s4 +; GFX10-NEXT: v_mov_b32_e32 v6, s3 ; GFX10-NEXT: v_mov_b32_e32 v7, s6 +; GFX10-NEXT: v_mov_b32_e32 v8, s1 ; GFX10-NEXT: ds_write_b8 v1, v0 ; GFX10-NEXT: ds_write_b8 v1, v2 offset:4 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:2 @@ -183,13 +183,15 @@ ; GFX10-NEXT: ds_write_b8 v1, v6 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v7 offset:3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:5 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_lshr_b32 s0, s5, s0 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:7 ; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 -; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX10-NEXT: ds_write_b8 v1, v2 offset:11 +; GFX10-NEXT: ds_write_b8 v1, v0 offset:9 +; GFX10-NEXT: ds_write_b8 v1, v2 offset:10 +; GFX10-NEXT: ds_write_b8 v1, v4 offset:11 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align1: @@ -197,26 +199,27 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_and_b32 s1, 0xffff, 8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, 0xffff, s4 -; GFX11-NEXT: s_lshr_b32 s1, s4, 16 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s4 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: s_lshr_b32 s4, s6, 16 -; GFX11-NEXT: s_lshr_b32 s2, s2, 8 ; GFX11-NEXT: s_lshr_b32 s0, s5, 16 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s5 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s5 ; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s6 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s2 -; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: s_lshr_b32 s1, s3, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s0, s5, 8 -; GFX11-NEXT: s_lshr_b32 s5, s4, 8 -; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s1 -; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v11, s0 -; GFX11-NEXT: v_mov_b32_e32 v12, s5 +; GFX11-NEXT: s_lshr_b32 s5, s6, 16 +; GFX11-NEXT: s_lshr_b32 s3, s3, s1 +; GFX11-NEXT: s_and_b32 s7, 0xffff, s6 +; GFX11-NEXT: s_lshr_b32 s6, s2, s1 +; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v7, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_lshr_b32 s2, s4, s1 +; GFX11-NEXT: s_lshr_b32 s4, s0, s1 +; GFX11-NEXT: s_lshr_b32 s0, s7, s1 +; GFX11-NEXT: s_lshr_b32 s1, s5, s1 +; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v12, s1 ; GFX11-NEXT: ds_store_b8 v1, v0 ; GFX11-NEXT: ds_store_b8 v1, v7 offset:1 ; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -60,8 +60,9 @@ ; ; GFX8-LABEL: s_uaddsat_i7: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s1, s1, 9 -; GFX8-NEXT: s_lshl_b32 s0, s0, 9 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 
s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 @@ -70,8 +71,9 @@ ; ; GFX9-LABEL: s_uaddsat_i7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s1, s1, 9 -; GFX9-NEXT: s_lshl_b32 s0, s0, 9 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 @@ -80,8 +82,9 @@ ; ; GFX10PLUS-LABEL: s_uaddsat_i7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 +; GFX10PLUS-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -145,8 +148,9 @@ ; ; GFX8-LABEL: s_uaddsat_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -155,8 +159,9 @@ ; ; GFX9-LABEL: s_uaddsat_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -165,8 +170,9 @@ ; ; GFX10PLUS-LABEL: s_uaddsat_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10PLUS-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -297,14 +303,15 @@ ; ; GFX8-LABEL: s_uaddsat_v2i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s4, 0xffff, 8 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s3, 8 +; GFX8-NEXT: s_lshl_b32 s1, s3, s4 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: s_lshl_b32 s0, s2, 8 +; GFX8-NEXT: s_lshl_b32 s0, s2, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 @@ -602,28 +609,29 @@ ; ; GFX8-LABEL: s_uaddsat_v4i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s8, 0xffff, 8 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s5, 8 +; GFX8-NEXT: s_lshl_b32 s1, s5, s8 ; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: s_lshl_b32 s0, s2, 8 +; GFX8-NEXT: s_lshl_b32 s0, s2, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp -; GFX8-NEXT: s_lshl_b32 s1, s6, 8 +; GFX8-NEXT: s_lshl_b32 s1, s6, s8 ; GFX8-NEXT: 
v_mov_b32_e32 v4, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s3, 8 +; GFX8-NEXT: s_lshl_b32 s0, s3, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 +; GFX8-NEXT: s_lshl_b32 s1, s7, s8 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp -; GFX8-NEXT: s_lshl_b32 s0, s4, 8 +; GFX8-NEXT: s_lshl_b32 s0, s4, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -58,8 +58,9 @@ ; ; GFX8-LABEL: s_usubsat_i7: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s1, s1, 9 -; GFX8-NEXT: s_lshl_b32 s0, s0, 9 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 @@ -68,8 +69,9 @@ ; ; GFX9-LABEL: s_usubsat_i7: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s1, s1, 9 -; GFX9-NEXT: s_lshl_b32 s0, s0, 9 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 @@ -78,8 +80,9 @@ ; ; GFX10PLUS-LABEL: s_usubsat_i7: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 +; GFX10PLUS-NEXT: s_and_b32 s2, 0xffff, 9 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -141,8 +144,9 @@ ; ; GFX8-LABEL: s_usubsat_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -151,8 +155,9 @@ ; ; GFX9-LABEL: s_usubsat_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, 8 +; GFX9-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 @@ -161,8 +166,9 @@ ; ; GFX10PLUS-LABEL: s_usubsat_i8: ; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 -; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10PLUS-NEXT: s_and_b32 s2, 0xffff, 8 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp ; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0 ; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 @@ -289,14 +295,15 @@ ; ; GFX8-LABEL: s_usubsat_v2i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s4, 0xffff, 8 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s3, 8 +; GFX8-NEXT: s_lshl_b32 s1, s3, s4 ; GFX8-NEXT: 
v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: s_lshl_b32 s0, s2, 8 +; GFX8-NEXT: s_lshl_b32 s0, s2, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 @@ -586,28 +593,29 @@ ; ; GFX8-LABEL: s_usubsat_v4i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s8, 0xffff, 8 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshl_b32 s1, s1, s8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshl_b32 s0, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s5, 8 +; GFX8-NEXT: s_lshl_b32 s1, s5, s8 ; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp -; GFX8-NEXT: s_lshl_b32 s0, s2, 8 +; GFX8-NEXT: s_lshl_b32 s0, s2, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp -; GFX8-NEXT: s_lshl_b32 s1, s6, 8 +; GFX8-NEXT: s_lshl_b32 s1, s6, s8 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s3, 8 +; GFX8-NEXT: s_lshl_b32 s0, s3, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 +; GFX8-NEXT: s_lshl_b32 s1, s7, s8 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp -; GFX8-NEXT: s_lshl_b32 s0, s4, 8 +; GFX8-NEXT: s_lshl_b32 s0, s4, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 Index: llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -790,3 +790,139 @@ S_ENDPGM 0, implicit %4 ... + +--- +name: constant_s_and_b32_only_implicit_def_scc_is_used +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_s_and_b32_only_implicit_def_scc_is_used + ; GCN: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 32, 15, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0, implicit $scc + %0:sgpr_32 = S_MOV_B32 32 + %1:sgpr_32 = S_MOV_B32 15 + %2:sgpr_32 = S_AND_B32 %0, %1, implicit-def $scc + S_ENDPGM 0, implicit $scc + +... + +--- +name: constant_s_and_b32_implicit_def_scc_is_used +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_s_and_b32_implicit_def_scc_is_used + ; GCN: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 32, 15, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0, implicit [[S_AND_B32_]], implicit $scc + %0:sgpr_32 = S_MOV_B32 32 + %1:sgpr_32 = S_MOV_B32 15 + %2:sgpr_32 = S_AND_B32 %0, %1, implicit-def $scc + S_ENDPGM 0, implicit %2, implicit $scc + +... + +--- +name: constant_s_and_b32_only_implicit_def_scc_is_dead +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_s_and_b32_only_implicit_def_scc_is_dead + ; GCN: S_ENDPGM 0, implicit undef $scc + %0:sgpr_32 = S_MOV_B32 32 + %1:sgpr_32 = S_MOV_B32 15 + %2:sgpr_32 = S_AND_B32 %0, %1, implicit-def dead $scc + S_ENDPGM 0, implicit undef $scc + +... 
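+# Note (descriptive comment, summarizing only what the checks above and below
+# verify): the three tests above exercise S_AND_B32 together with its
+# implicit-def of $scc. While $scc is live the S_AND_B32 must survive (only its
+# register operands get folded to immediates), and in the dead-$scc case the
+# fold goes ahead and, with the result unused, the instruction disappears
+# entirely. The tests below repeat the live-$scc case for S_OR_B32, S_XOR_B32
+# and S_NOT_B32, and for V_AND_B32_e32 carrying extra implicit operands; the
+# final test checks that a plain implicit use, unlike an implicit def, does not
+# block the fold.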
+ +--- +name: constant_s_or_b32_only_implicit_def_scc_is_used +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_s_or_b32_only_implicit_def_scc_is_used + ; GCN: [[S_OR_B32_:%[0-9]+]]:sgpr_32 = S_OR_B32 32, 15, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0, implicit $scc + %0:sgpr_32 = S_MOV_B32 32 + %1:sgpr_32 = S_MOV_B32 15 + %2:sgpr_32 = S_OR_B32 %0, %1, implicit-def $scc + S_ENDPGM 0, implicit $scc + +... + +--- +name: constant_s_xor_b32_only_implicit_def_scc_is_used +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_s_xor_b32_only_implicit_def_scc_is_used + ; GCN: [[S_XOR_B32_:%[0-9]+]]:sgpr_32 = S_XOR_B32 32, 15, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0, implicit $scc + %0:sgpr_32 = S_MOV_B32 32 + %1:sgpr_32 = S_MOV_B32 15 + %2:sgpr_32 = S_XOR_B32 %0, %1, implicit-def $scc + S_ENDPGM 0, implicit $scc + +... + +--- +name: constant_s_not_b32_only_implicit_def_scc_is_used +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_s_not_b32_only_implicit_def_scc_is_used + ; GCN: [[S_NOT_B32_:%[0-9]+]]:sgpr_32 = S_NOT_B32 32, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0, implicit $scc + %0:sgpr_32 = S_MOV_B32 32 + %1:sgpr_32 = S_NOT_B32 %0, implicit-def $scc + S_ENDPGM 0, implicit $scc + +... + +# Really really implicit operand +--- +name: constant_v_and_b32_implicit_def_scc_is_used +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_v_and_b32_implicit_def_scc_is_used + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 15, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 32, [[V_MOV_B32_e32_]], implicit $exec, implicit-def $scc + ; GCN-NEXT: S_ENDPGM 0, implicit $scc + %0:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 15, implicit $exec + %2:vgpr_32 = V_AND_B32_e32 %0, %1, implicit $exec, implicit-def $scc + S_ENDPGM 0, implicit $scc + +... + +--- +name: constant_v_and_b32_implicit_def_vreg_is_used +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_v_and_b32_implicit_def_vreg_is_used + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 15, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 32, [[V_MOV_B32_e32_]], implicit $exec, implicit-def %3 + ; GCN-NEXT: S_ENDPGM 0, implicit %3 + %0:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 15, implicit $exec + %2:vgpr_32 = V_AND_B32_e32 %0, %1, implicit $exec, implicit-def %3:vgpr_32 + S_ENDPGM 0, implicit %3 + +... +--- +name: constant_v_and_b32_implicit_use +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_v_and_b32_implicit_use + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] + %0:vgpr_32 = V_MOV_B32_e32 32, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 15, implicit $exec + %2:vgpr_32 = V_MOV_B32_e32 24, implicit $exec + %3:vgpr_32 = V_AND_B32_e32 %0, %1, implicit $exec, implicit %2:vgpr_32 + S_ENDPGM 0, implicit %3 + +... 
Index: llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -76,16 +76,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; VI-NEXT: v_mac_f32_e32 v2, v0, v1 -; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 -; VI-NEXT: s_setpc_b64 s[30:31] +; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: ; SDAG-CI: ; %bb.0: @@ -96,6 +96,19 @@ ; SDAG-CI-NEXT: v_mov_b32_e32 v0, 1.0 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-VI-NEXT: s_mov_b32 s4, 0xffff +; GISEL-VI-NEXT: s_and_b32 s4, s4, 0x3c00 +; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, s4, v0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) Index: llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -0,0 +1,319 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +%"struct.__llvm_libc::rpc::Buffer" = type { [8 x i64] } + +define void @issue63986(i64 %0, i64 %idxprom) { +; CHECK-LABEL: issue63986: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b64 v[4:5], 6, v[2:3] +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: v_lshlrev_b64 v[6:7], 6, v[2:3] +; CHECK-NEXT: s_mov_b64 s[6:7], 0 +; CHECK-NEXT: .LBB0_2: ; %loop-memcpy-expansion +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_mov_b32_e32 v9, s7 +; CHECK-NEXT: v_mov_b32_e32 v8, s6 +; CHECK-NEXT: flat_load_ubyte v10, v[8:9] offset:5 +; CHECK-NEXT: flat_load_ubyte v11, v[8:9] offset:6 +; CHECK-NEXT: flat_load_ubyte v12, v[8:9] offset:7 +; CHECK-NEXT: flat_load_ubyte v13, v[8:9] offset:3 +; CHECK-NEXT: flat_load_ubyte v14, v[8:9] offset:2 +; CHECK-NEXT: flat_load_ubyte v15, v[8:9] offset:1 +; CHECK-NEXT: flat_load_ubyte v16, v[8:9] +; CHECK-NEXT: flat_load_ubyte v17, v[8:9] offset:4 +; CHECK-NEXT: flat_load_ubyte v18, 
v[8:9] offset:13 +; CHECK-NEXT: flat_load_ubyte v19, v[8:9] offset:14 +; CHECK-NEXT: flat_load_ubyte v20, v[8:9] offset:15 +; CHECK-NEXT: flat_load_ubyte v21, v[8:9] offset:11 +; CHECK-NEXT: flat_load_ubyte v22, v[8:9] offset:10 +; CHECK-NEXT: flat_load_ubyte v23, v[8:9] offset:9 +; CHECK-NEXT: flat_load_ubyte v24, v[8:9] offset:8 +; CHECK-NEXT: flat_load_ubyte v25, v[8:9] offset:12 +; CHECK-NEXT: s_add_u32 s4, s4, 1 +; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s6, v6 +; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[4:5], 2 +; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v7, vcc +; CHECK-NEXT: s_add_u32 s6, s6, 16 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[8:9], v13 offset:3 +; CHECK-NEXT: flat_store_byte v[8:9], v14 offset:2 +; CHECK-NEXT: flat_store_byte v[8:9], v15 offset:1 +; CHECK-NEXT: flat_store_byte v[8:9], v16 +; CHECK-NEXT: flat_store_byte v[8:9], v12 offset:7 +; CHECK-NEXT: flat_store_byte v[8:9], v11 offset:6 +; CHECK-NEXT: flat_store_byte v[8:9], v10 offset:5 +; CHECK-NEXT: flat_store_byte v[8:9], v17 offset:4 +; CHECK-NEXT: flat_store_byte v[8:9], v21 offset:11 +; CHECK-NEXT: flat_store_byte v[8:9], v22 offset:10 +; CHECK-NEXT: flat_store_byte v[8:9], v23 offset:9 +; CHECK-NEXT: flat_store_byte v[8:9], v24 offset:8 +; CHECK-NEXT: flat_store_byte v[8:9], v20 offset:15 +; CHECK-NEXT: flat_store_byte v[8:9], v19 offset:14 +; CHECK-NEXT: flat_store_byte v[8:9], v18 offset:13 +; CHECK-NEXT: flat_store_byte v[8:9], v25 offset:12 +; CHECK-NEXT: s_cbranch_vccz .LBB0_2 +; CHECK-NEXT: ; %bb.3: ; %loop-memcpy-residual-header +; CHECK-NEXT: s_and_b32 s4, 32, 15 +; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 +; CHECK-NEXT: ; %bb.4: +; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: s_branch .LBB0_6 +; CHECK-NEXT: .LBB0_5: ; %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge +; CHECK-NEXT: v_lshlrev_b64 v[2:3], 6, v[2:3] +; CHECK-NEXT: s_cbranch_execnz .LBB0_9 +; CHECK-NEXT: .LBB0_6: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: v_or_b32_e32 v2, 32, v4 +; CHECK-NEXT: v_mov_b32_e32 v3, v5 +; CHECK-NEXT: s_mov_b64 s[6:7], 0 +; CHECK-NEXT: .LBB0_7: ; %loop-memcpy-residual +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_add_u32 s8, 32, s6 +; CHECK-NEXT: s_addc_u32 s9, 0, s7 +; CHECK-NEXT: v_mov_b32_e32 v6, s8 +; CHECK-NEXT: v_mov_b32_e32 v7, s9 +; CHECK-NEXT: flat_load_ubyte v10, v[6:7] +; CHECK-NEXT: v_mov_b32_e32 v9, s7 +; CHECK-NEXT: v_mov_b32_e32 v7, s5 +; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s6, v2 +; CHECK-NEXT: s_add_u32 s6, s6, 1 +; CHECK-NEXT: v_mov_b32_e32 v6, s4 +; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v3, v9, vcc +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[8:9], v10 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_7 +; CHECK-NEXT: ; %bb.8: +; CHECK-NEXT: v_mov_b32_e32 v2, v4 +; CHECK-NEXT: v_mov_b32_e32 v3, v5 +; CHECK-NEXT: .LBB0_9: ; %post-loop-memcpy-expansion +; CHECK-NEXT: v_lshrrev_b64 v[4:5], 4, v[0:1] +; CHECK-NEXT: v_and_b32_e32 v6, 15, v0 +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: v_and_b32_e32 v0, -16, v0 +; CHECK-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[4:5] +; CHECK-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[6:7] +; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, v2, v0 +; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v3, v1, vcc +; CHECK-NEXT: s_branch .LBB0_12 +; CHECK-NEXT: 
.LBB0_10: ; %Flow19 +; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 +; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] +; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: .LBB0_11: ; %Flow21 +; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_cbranch_vccz .LBB0_20 +; CHECK-NEXT: .LBB0_12: ; %while.cond +; CHECK-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NEXT: ; Child Loop BB0_14 Depth 2 +; CHECK-NEXT: ; Child Loop BB0_18 Depth 2 +; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; CHECK-NEXT: s_cbranch_execz .LBB0_15 +; CHECK-NEXT: ; %bb.13: ; %loop-memcpy-expansion2.preheader +; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 +; CHECK-NEXT: s_mov_b64 s[10:11], 0 +; CHECK-NEXT: s_mov_b64 s[12:13], 0 +; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: .LBB0_14: ; %loop-memcpy-expansion2 +; CHECK-NEXT: ; Parent Loop BB0_12 Depth=1 +; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NEXT: v_mov_b32_e32 v10, s10 +; CHECK-NEXT: v_mov_b32_e32 v11, s11 +; CHECK-NEXT: flat_load_ubyte v12, v[10:11] offset:5 +; CHECK-NEXT: flat_load_ubyte v13, v[10:11] offset:6 +; CHECK-NEXT: flat_load_ubyte v14, v[10:11] offset:7 +; CHECK-NEXT: flat_load_ubyte v15, v[10:11] offset:3 +; CHECK-NEXT: flat_load_ubyte v16, v[10:11] offset:2 +; CHECK-NEXT: flat_load_ubyte v17, v[10:11] offset:1 +; CHECK-NEXT: flat_load_ubyte v18, v[10:11] +; CHECK-NEXT: flat_load_ubyte v19, v[10:11] offset:4 +; CHECK-NEXT: flat_load_ubyte v20, v[10:11] offset:13 +; CHECK-NEXT: flat_load_ubyte v21, v[10:11] offset:14 +; CHECK-NEXT: flat_load_ubyte v22, v[10:11] offset:15 +; CHECK-NEXT: flat_load_ubyte v23, v[10:11] offset:11 +; CHECK-NEXT: flat_load_ubyte v24, v[10:11] offset:10 +; CHECK-NEXT: flat_load_ubyte v25, v[10:11] offset:9 +; CHECK-NEXT: flat_load_ubyte v26, v[10:11] offset:8 +; CHECK-NEXT: flat_load_ubyte v27, v[10:11] offset:12 +; CHECK-NEXT: s_add_u32 s14, s14, 1 +; CHECK-NEXT: v_add_co_u32_e32 v10, vcc, s10, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v3, vcc +; CHECK-NEXT: s_addc_u32 s15, s15, 0 +; CHECK-NEXT: s_add_u32 s10, s10, 16 +; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5] +; CHECK-NEXT: s_addc_u32 s11, s11, 0 +; CHECK-NEXT: s_or_b64 s[12:13], vcc, s[12:13] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[10:11], v15 offset:3 +; CHECK-NEXT: flat_store_byte v[10:11], v16 offset:2 +; CHECK-NEXT: flat_store_byte v[10:11], v17 offset:1 +; CHECK-NEXT: flat_store_byte v[10:11], v18 +; CHECK-NEXT: flat_store_byte v[10:11], v14 offset:7 +; CHECK-NEXT: flat_store_byte v[10:11], v13 offset:6 +; CHECK-NEXT: flat_store_byte v[10:11], v12 offset:5 +; CHECK-NEXT: flat_store_byte v[10:11], v19 offset:4 +; CHECK-NEXT: flat_store_byte v[10:11], v23 offset:11 +; CHECK-NEXT: flat_store_byte v[10:11], v24 offset:10 +; CHECK-NEXT: flat_store_byte v[10:11], v25 offset:9 +; CHECK-NEXT: flat_store_byte v[10:11], v26 offset:8 +; CHECK-NEXT: flat_store_byte v[10:11], v22 offset:15 +; CHECK-NEXT: flat_store_byte v[10:11], v21 offset:14 +; CHECK-NEXT: flat_store_byte v[10:11], v20 offset:13 +; CHECK-NEXT: flat_store_byte v[10:11], v27 offset:12 +; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] +; CHECK-NEXT: s_cbranch_execnz .LBB0_14 +; CHECK-NEXT: .LBB0_15: ; %Flow20 +; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: s_cbranch_execz .LBB0_11 +; CHECK-NEXT: ; %bb.16: ; %loop-memcpy-residual-header5 +; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 +; CHECK-NEXT: 
s_and_saveexec_b64 s[8:9], s[6:7] +; CHECK-NEXT: s_xor_b64 s[10:11], exec, s[8:9] +; CHECK-NEXT: s_cbranch_execz .LBB0_10 +; CHECK-NEXT: ; %bb.17: ; %loop-memcpy-residual4.preheader +; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 +; CHECK-NEXT: s_mov_b64 s[12:13], 0 +; CHECK-NEXT: s_mov_b64 s[14:15], 0 +; CHECK-NEXT: .LBB0_18: ; %loop-memcpy-residual4 +; CHECK-NEXT: ; Parent Loop BB0_12 Depth=1 +; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NEXT: v_mov_b32_e32 v12, s15 +; CHECK-NEXT: v_add_co_u32_e32 v10, vcc, s14, v0 +; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v12, vcc +; CHECK-NEXT: flat_load_ubyte v13, v[10:11] +; CHECK-NEXT: v_add_co_u32_e32 v10, vcc, s14, v8 +; CHECK-NEXT: s_add_u32 s14, s14, 1 +; CHECK-NEXT: s_addc_u32 s15, s15, 0 +; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[14:15], v[6:7] +; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v9, v12, vcc +; CHECK-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[10:11], v13 +; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] +; CHECK-NEXT: s_cbranch_execnz .LBB0_18 +; CHECK-NEXT: ; %bb.19: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_12 Depth=1 +; CHECK-NEXT: s_or_b64 exec, exec, s[12:13] +; CHECK-NEXT: s_branch .LBB0_10 +; CHECK-NEXT: .LBB0_20: ; %DummyReturnBlock +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %arrayidx = getelementptr [32 x %"struct.__llvm_libc::rpc::Buffer"], ptr null, i64 0, i64 %idxprom + %spec.select = tail call i64 @llvm.umin.i64(i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) inttoptr (i64 32 to ptr addrspace(4)) to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 56) + tail call void @llvm.memcpy.p0.p0.i64(ptr %arrayidx, ptr null, i64 %spec.select, i1 false) + br label %while.cond + +while.cond: ; preds = %while.cond + tail call void @llvm.memcpy.p0.p0.i64(ptr %arrayidx, ptr null, i64 %0, i1 false) + br label %while.cond +} + +define void @issue63986_reduced_expanded(i64 %idxprom) { +; CHECK-LABEL: issue63986_reduced_expanded: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: ; %bb.1: ; %loop-memcpy-expansion.preheader +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: ; %bb.2: ; %loop-memcpy-residual-header +; CHECK-NEXT: s_and_b32 s4, 32, 15 +; CHECK-NEXT: s_mov_b32 s5, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB1_4 +; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: s_branch .LBB1_5 +; CHECK-NEXT: .LBB1_4: ; %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge +; CHECK-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; CHECK-NEXT: s_cbranch_execnz .LBB1_8 +; CHECK-NEXT: .LBB1_5: ; %loop-memcpy-residual.preheader +; CHECK-NEXT: s_mov_b64 s[6:7], 0 +; CHECK-NEXT: .LBB1_6: ; %loop-memcpy-residual +; CHECK-NEXT: s_add_u32 s6, s6, 1 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: s_addc_u32 s7, s7, 0 +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; CHECK-NEXT: s_mov_b64 s[6:7], 1 +; CHECK-NEXT: s_cbranch_vccnz .LBB1_6 +; CHECK-NEXT: ; %bb.7: ; %Flow +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: .LBB1_8: ; %post-loop-memcpy-expansion +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_and_b64 vcc, exec, 0 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:1 +; CHECK-NEXT: flat_store_byte 
v[0:1], v2 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:5 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:4 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:11 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:10 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:9 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:8 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:14 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:13 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:12 +; CHECK-NEXT: .LBB1_9: ; %loop-memcpy-expansion2 +; CHECK-NEXT: s_mov_b64 vcc, vcc +; CHECK-NEXT: s_cbranch_vccz .LBB1_9 +; CHECK-NEXT: ; %bb.10: ; %DummyReturnBlock +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %spec.select = tail call i64 @llvm.umin.i64(i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) inttoptr (i64 32 to ptr addrspace(4)) to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 56) + %i = trunc i64 %spec.select to i32 + %i1 = urem i32 %i, 16 + %i2 = zext i32 %i to i64 + %i3 = zext i32 %i1 to i64 + %i4 = icmp ne i64 %i2, 0 + br i1 %i4, label %loop-memcpy-expansion.preheader, label %loop-memcpy-residual-header + +loop-memcpy-expansion.preheader: ; preds = %entry + ret void + +loop-memcpy-residual: ; preds = %loop-memcpy-residual.preheader, %loop-memcpy-residual + %residual-loop-index1 = phi i64 [ 1, %loop-memcpy-residual ], [ 0, %loop-memcpy-residual.preheader ] + %i5 = add i64 %residual-loop-index1, 1 + %i6 = icmp ult i64 %i5, %i3 + br i1 %i6, label %loop-memcpy-residual, label %post-loop-memcpy-expansion + +post-loop-memcpy-expansion: ; preds = %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge, %loop-memcpy-residual + %.pre-phi = phi i64 [ %.pre, %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge ], [ 0, %loop-memcpy-residual ] + br label %loop-memcpy-expansion2 + +loop-memcpy-expansion2: ; preds = %loop-memcpy-expansion2, %post-loop-memcpy-expansion + %scevgep7 = getelementptr i8, ptr null, i64 %.pre-phi + store <4 x i32> zeroinitializer, ptr %scevgep7, align 1 + br label %loop-memcpy-expansion2 + +loop-memcpy-residual-header: ; preds = %entry + %i7 = icmp ne i64 %i3, 0 + br i1 %i7, label %loop-memcpy-residual.preheader, label %loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge + +loop-memcpy-residual-header.post-loop-memcpy-expansion_crit_edge: ; preds = %loop-memcpy-residual-header + %.pre = shl i64 %idxprom, 1 + br label %post-loop-memcpy-expansion + +loop-memcpy-residual.preheader: ; preds = %loop-memcpy-residual-header + br label %loop-memcpy-residual +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0 +declare i64 @llvm.umin.i64(i64, i64) #1 + +attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
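+
+; Descriptive note: issue63986_reduced_expanded is a reduced variant of the
+; reproducer above in which the memcpy expansion control flow (the
+; loop-memcpy-expansion / loop-memcpy-residual blocks) is written out directly
+; instead of going through @llvm.memcpy. The CHECK lines in this file are
+; autogenerated (see the update_llc_test_checks.py UTC_ARGS note at the top of
+; the file); regenerate them with that script rather than editing them by hand.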