diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -237,6 +237,36 @@
 def AMDGPUmul_u24_oneuse : HasOneUseBinOp<AMDGPUmul_u24>;
 def AMDGPUmul_i24_oneuse : HasOneUseBinOp<AMDGPUmul_i24>;
 
+//===----------------------------------------------------------------------===//
+// PatFrags for shifts
+//===----------------------------------------------------------------------===//
+
+// Constrained shift PatFrags.
+foreach width = [16, 32, 64] in {
+defvar mask = !sub(width, 1);
+
+def cshl_#width : PatFrags<(ops node:$src0, node:$src1),
+  [(shl node:$src0, node:$src1), (shl node:$src0, (and node:$src1, mask))]>;
+defvar cshl = !cast<SDPatternOperator>("cshl_"#width);
+def cshl_#width#_oneuse : HasOneUseBinOp<cshl>;
+def clshl_rev_#width : PatFrag <(ops node:$src0, node:$src1),
+  (cshl $src1, $src0)>;
+
+def csrl_#width : PatFrags<(ops node:$src0, node:$src1),
+  [(srl node:$src0, node:$src1), (srl node:$src0, (and node:$src1, mask))]>;
+defvar csrl = !cast<SDPatternOperator>("csrl_"#width);
+def csrl_#width#_oneuse : HasOneUseBinOp<csrl>;
+def clshr_rev_#width : PatFrag <(ops node:$src0, node:$src1),
+  (csrl $src1, $src0)>;
+
+def csra_#width : PatFrags<(ops node:$src0, node:$src1),
+  [(sra node:$src0, node:$src1), (sra node:$src0, (and node:$src1, mask))]>;
+defvar csra = !cast<SDPatternOperator>("csra_"#width);
+def csra_#width#_oneuse : HasOneUseBinOp<csra>;
+def cashr_rev_#width : PatFrag <(ops node:$src0, node:$src1),
+  (csra $src1, $src0)>;
+} // end foreach width
+
 def srl_16 : PatFrag<
   (ops node:$src0), (srl_oneuse node:$src0, (i32 16))
 >;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -598,22 +598,22 @@
 let Defs = [SCC] in {
 // TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
 def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
-  [(set SReg_32:$sdst, (UniformBinFrag<shl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_32:$sdst, (UniformBinFrag<cshl_32> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
 >;
 def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
-  [(set SReg_64:$sdst, (UniformBinFrag<shl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_64:$sdst, (UniformBinFrag<cshl_64> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
 >;
 def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
-  [(set SReg_32:$sdst, (UniformBinFrag<srl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_32:$sdst, (UniformBinFrag<csrl_32> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
 >;
 def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
-  [(set SReg_64:$sdst, (UniformBinFrag<srl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_64:$sdst, (UniformBinFrag<csrl_64> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
 >;
 def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
-  [(set SReg_32:$sdst, (UniformBinFrag<sra> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_32:$sdst, (UniformBinFrag<csra_32> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
 >;
 def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
-  [(set SReg_64:$sdst, (UniformBinFrag<sra> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+  [(set SReg_64:$sdst, (UniformBinFrag<csra_64> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
 >;
 } // End Defs = [SCC]
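The cshl/csrl/csra PatFrags above match a shift both in its plain form and with the shift amount pre-masked to width-1. Folding the mask away is safe because the AMDGPU shift instructions only read the low bits of the shift amount (5 bits for 32-bit shifts, 6 for 64-bit, 4 for 16-bit). A minimal IR sketch of the case this is meant to catch (illustrative only, mirroring the csh_32 test added later in this patch; names are made up):

define i32 @masked_shl(i32 %a, i32 %b) {
  ; The mask to 31 is redundant for the hardware: v_lshlrev_b32/s_lshl_b32
  ; already ignore all but the low 5 bits of the amount, so cshl_32 lets
  ; instruction selection emit the shift and drop the and.
  %amt = and i32 %b, 31
  %shl = shl i32 %a, %amt
  ret i32 %shl
}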
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -501,9 +501,9 @@
 defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
 defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
 defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>;
-defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, lshr_rev, "v_lshr_b32">;
-defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, ashr_rev, "v_ashr_i32">;
-defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, lshl_rev, "v_lshl_b32">;
+defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, clshr_rev_32, "v_lshr_b32">;
+defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, cashr_rev_32, "v_ashr_i32">;
+defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, clshl_rev_32, "v_lshl_b32">;
 defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
 defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
 defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
@@ -578,9 +578,9 @@ let isCommutable = 1 in {
 let SubtargetPredicate = isGFX6GFX7 in {
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, csrl_32>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, csra_32>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, cshl_32>;
 } // End SubtargetPredicate = isGFX6GFX7
 } // End isCommutable = 1
 } // End isReMaterializable = 1
@@ -605,9 +605,9 @@
   )
 >;
 
-def : DivergentBinOp<srl, V_LSHRREV_B32_e32>;
-def : DivergentBinOp<sra, V_ASHRREV_I32_e32>;
-def : DivergentBinOp<shl, V_LSHLREV_B32_e32>;
+def : DivergentBinOp<csrl_32, V_LSHRREV_B32_e32>;
+def : DivergentBinOp<csra_32, V_ASHRREV_I32_e32>;
+def : DivergentBinOp<cshl_32, V_LSHLREV_B32_e32>;
 
 let SubtargetPredicate = HasAddNoCarryInsts in {
   def : DivergentClampingBinOp<add, V_ADD_U32_e64>;
@@ -648,9 +648,9 @@
 defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
 } // End FPDPRounding = 1
 
-defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, lshl_rev>;
-defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, lshr_rev>;
-defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>;
+defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;
+defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, clshr_rev_16>;
+defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, cashr_rev_16>;
 
 let isCommutable = 1 in {
 let FPDPRounding = 1 in {
@@ -852,9 +852,9 @@
 defm : Arithmetic_i16_0Hi_Pats<smax, V_MAX_I16_e64>;
 defm : Arithmetic_i16_0Hi_Pats<umin, V_MIN_U16_e64>;
 defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>;
-defm : Arithmetic_i16_0Hi_Pats<lshl_rev, V_LSHLREV_B16_e64>;
-defm : Arithmetic_i16_0Hi_Pats<lshr_rev, V_LSHRREV_B16_e64>;
-defm : Arithmetic_i16_0Hi_Pats<ashr_rev, V_ASHRREV_I16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<clshl_rev_16, V_LSHLREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<clshr_rev_16, V_LSHRREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<cashr_rev_16, V_ASHRREV_I16_e64>;
 } // End Predicates = [Has16BitInsts, isGFX7GFX8GFX9]
 
 def : ZExt_i16_i1_Pat<zext>;
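The reversed-operand patterns (clshl_rev_16/clshr_rev_16/cashr_rev_16 and the 32-bit equivalents) give the same folding for the v_*rev_* instruction forms, and that is where most of the test churn below comes from: the generic fshl/fshr lowering masks the variable shift amount (with 15 for i16), and that mask can now be absorbed when the shift is selected. Roughly, as an illustrative sketch not taken from the patch:

define i16 @masked_shl_i16(i16 %a, i16 %b) {
  ; On subtargets with 16-bit VALU instructions this is expected to become a
  ; single v_lshlrev_b16: the hardware uses only the low 4 bits of the shift
  ; amount, so cshl_16/clshl_rev_16 let the and with 15 be dropped.
  %amt = and i16 %b, 15
  %shl = shl i16 %a, %amt
  ret i16 %shl
}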
<"v_lshrrev_b64", VOP3_Profile, lshr_rev>; - defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile, ashr_rev>; + defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile, clshl_rev_64>; + defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile, clshr_rev_64>; + defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile, cashr_rev_64>; } // End SubtargetPredicate = isGFX8Plus } // End SchedRW = [Write64Bit] } // End isReMaterializable = 1 @@ -656,10 +656,10 @@ (inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) >; -def : ThreeOp_i32_Pats; -def : ThreeOp_i32_Pats; +def : ThreeOp_i32_Pats; +def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; -def : ThreeOp_i32_Pats; +def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -82,9 +82,9 @@ defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile>; defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile, sub>; -defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile, lshl_rev>; -defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile, ashr_rev>; -defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, lshr_rev>; +defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile, clshl_rev_16>; +defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile, cashr_rev_16>; +defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile, clshr_rev_16>; let SubtargetPredicate = HasVOP3PInsts in { diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -759,10 +759,11 @@ int ret = TP.NumOperands; } - class getDivergentFrag { + assert !or(!isa(Op), !isa(Op)), "Expected SDNode or PatFrags"; - int NumSrcArgs = getNumNodeArgs.ret; + int NumSrcArgs = !if(!isa(Op), getNumNodeArgs.ret, + !size(!cast(Op).Operands)); PatFrag ret = PatFrag < !if(!eq(NumSrcArgs, 1), (ops node:$src0), diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -2898,24 +2898,20 @@ ; GFX8-LABEL: v_fshl_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 15, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v3, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2924,9 +2920,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3025,39 +3019,33 @@ ; ; GFX8-LABEL: v_fshl_i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i16_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: s_bfe_u32 s0, s1, 0x100000 ; GFX9-NEXT: s_bfe_u32 s1, 1, 0x100000 -; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 -; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i16_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -3310,21 +3298,17 @@ ; GFX8-LABEL: v_fshl_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v4, 1 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 ; 
GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -3447,25 +3431,21 @@ ; ; GFX8-LABEL: v_fshl_v2i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v1 ; GFX8-NEXT: s_lshr_b32 s0, s3, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -3944,37 +3924,28 @@ ; GFX8-LABEL: v_fshl_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9 -; GFX8-NEXT: v_or_b32_e32 v4, v8, v4 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v8, 1 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, v8, v9 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v6 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v8, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v7 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 -; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v5, 1 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX8-NEXT: 
v_lshrrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v8 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v7 +; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -2751,24 +2751,20 @@ ; GFX8-LABEL: v_fshr_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 15, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, v3, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2778,8 +2774,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2879,36 +2873,30 @@ ; ; GFX8-LABEL: v_fshr_i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i16_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 -; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s1 -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; 
GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s1 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i16_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3214,24 +3202,20 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v5 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v6, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -3377,31 +3361,27 @@ ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 ; GFX8-NEXT: s_lshr_b32 s5, s5, s6 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: s_lshr_b32 s5, s3, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, s4 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 ; GFX8-NEXT: s_lshl_b32 s2, s2, s4 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 ; GFX8-NEXT: s_bfe_u32 s0, s3, 0x100000 ; GFX8-NEXT: s_or_b32 s2, s2, s5 -; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v1 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, 
v1 +; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -4025,24 +4005,20 @@ ; GFX8-NEXT: v_mov_b32_e32 v8, 15 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v10, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, v10, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9 -; GFX8-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v7 -; GFX8-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_xor_b32_e32 v10, -1, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v9 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, v10, v6 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -4053,24 +4029,20 @@ ; GFX8-NEXT: v_mov_b32_e32 v6, 1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v3 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v4, v8, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v7 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, v5, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v7 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, v8, v5 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v6, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 
; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -0,0 +1,197 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s + +define i16 @csh_16(i16 %a, i16 %b) { +; CHECK-LABEL: csh_16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b16_e32 v2, v1, v0 +; CHECK-NEXT: v_lshrrev_b16_e32 v3, v1, v0 +; CHECK-NEXT: v_ashrrev_i16_e32 v0, v1, v0 +; CHECK-NEXT: v_add_u16_e32 v1, v2, v3 +; CHECK-NEXT: v_add_u16_e32 v0, v1, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %and = and i16 %b, 15 + %shl = shl i16 %a, %and + %lshr = lshr i16 %a, %and + %ashr = ashr i16 %a, %and + %ret.0 = add i16 %shl, %lshr + %ret = add i16 %ret.0, %ashr + ret i16 %ret +} + +define i32 @csh_32(i32 %a, i32 %b) { +; CHECK-LABEL: csh_32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b32_e32 v2, v1, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, v1, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, v1, v0 +; CHECK-NEXT: v_add3_u32 v0, v2, v3, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %b, 31 + %shl = shl i32 %a, %and + %lshr = lshr i32 %a, %and + %ashr = ashr i32 %a, %and + %ret.0 = add i32 %shl, %lshr + %ret = add i32 %ret.0, %ashr + ret i32 %ret +} + +define amdgpu_ps i32 @s_csh_32(i32 inreg %a, i32 inreg %b) { +; CHECK-LABEL: s_csh_32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s2, s0, s1 +; CHECK-NEXT: s_lshr_b32 s3, s0, s1 +; CHECK-NEXT: s_ashr_i32 s0, s0, s1 +; CHECK-NEXT: s_add_i32 s1, s2, s3 +; CHECK-NEXT: s_add_i32 s0, s1, s0 +; CHECK-NEXT: ; return to shader part epilog + %and = and i32 %b, 31 + %shl = shl i32 %a, %and + %lshr = lshr i32 %a, %and + %ashr = ashr i32 %a, %and + %ret.0 = add i32 %shl, %lshr + %ret = add i32 %ret.0, %ashr + ret i32 %ret +} + +define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: csh_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b32_e32 v8, v7, v3 +; CHECK-NEXT: v_lshlrev_b32_e32 v9, v6, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v10, v5, v1 +; CHECK-NEXT: v_lshlrev_b32_e32 v11, v4, v0 +; CHECK-NEXT: v_lshrrev_b32_e32 v12, v7, v3 +; CHECK-NEXT: v_lshrrev_b32_e32 v13, v6, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v14, v5, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v15, v4, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, v7, v3 +; CHECK-NEXT: v_ashrrev_i32_e32 v2, v6, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, v5, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, v4, v0 +; CHECK-NEXT: v_add3_u32 v0, v11, v15, v0 +; CHECK-NEXT: v_add3_u32 v1, v10, v14, v1 +; CHECK-NEXT: v_add3_u32 v2, v9, v13, v2 +; CHECK-NEXT: v_add3_u32 v3, v8, v12, v3 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %and = and <4 x i32> %b, + %shl = shl <4 x i32> %a, %and + %lshr = lshr <4 x i32> %a, %and + %ashr = ashr <4 x i32> %a, %and + %ret.0 = add <4 x i32> %shl, %lshr + %ret = add <4 x i32> %ret.0, %ashr + ret <4 x i32> %ret +} + +define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b) { +; CHECK-LABEL: s_csh_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s8, s0, s4 +; CHECK-NEXT: s_lshl_b32 s9, s1, s5 +; CHECK-NEXT: s_lshl_b32 s10, s2, s6 +; CHECK-NEXT: s_lshl_b32 s11, s3, s7 +; CHECK-NEXT: s_lshr_b32 s12, s0, s4 +; CHECK-NEXT: 
s_lshr_b32 s13, s1, s5 +; CHECK-NEXT: s_lshr_b32 s14, s2, s6 +; CHECK-NEXT: s_lshr_b32 s15, s3, s7 +; CHECK-NEXT: s_ashr_i32 s3, s3, s7 +; CHECK-NEXT: s_ashr_i32 s2, s2, s6 +; CHECK-NEXT: s_ashr_i32 s1, s1, s5 +; CHECK-NEXT: s_ashr_i32 s0, s0, s4 +; CHECK-NEXT: s_add_i32 s4, s11, s15 +; CHECK-NEXT: s_add_i32 s5, s10, s14 +; CHECK-NEXT: s_add_i32 s6, s9, s13 +; CHECK-NEXT: s_add_i32 s7, s8, s12 +; CHECK-NEXT: s_add_i32 s0, s7, s0 +; CHECK-NEXT: s_add_i32 s1, s6, s1 +; CHECK-NEXT: s_add_i32 s2, s5, s2 +; CHECK-NEXT: s_add_i32 s3, s4, s3 +; CHECK-NEXT: ; return to shader part epilog + %and = and <4 x i32> %b, + %shl = shl <4 x i32> %a, %and + %lshr = lshr <4 x i32> %a, %and + %ashr = ashr <4 x i32> %a, %and + %ret.0 = add <4 x i32> %shl, %lshr + %ret = add <4 x i32> %ret.0, %ashr + ret <4 x i32> %ret +} + +define i64 @csh_64(i64 %a, i64 %b) { +; CHECK-LABEL: csh_64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b64 v[3:4], v2, v[0:1] +; CHECK-NEXT: v_lshrrev_b64 v[5:6], v2, v[0:1] +; CHECK-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1] +; CHECK-NEXT: v_add_co_u32_e32 v2, vcc, v3, v5 +; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc +; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] + %and = and i64 %b, 63 + %shl = shl i64 %a, %and + %lshr = lshr i64 %a, %and + %ashr = ashr i64 %a, %and + %ret.0 = add i64 %shl, %lshr + %ret = add i64 %ret.0, %ashr + ret i64 %ret +} + +define amdgpu_ps i64 @s_csh_64(i64 inreg %a, i64 inreg %b) { +; CHECK-LABEL: s_csh_64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b64 s[4:5], s[0:1], s2 +; CHECK-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 +; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_add_u32 s2, s4, s6 +; CHECK-NEXT: s_addc_u32 s3, s5, s7 +; CHECK-NEXT: s_add_u32 s0, s2, s0 +; CHECK-NEXT: s_addc_u32 s1, s3, s1 +; CHECK-NEXT: ; return to shader part epilog + %and = and i64 %b, 63 + %shl = shl i64 %a, %and + %lshr = lshr i64 %a, %and + %ashr = ashr i64 %a, %and + %ret.0 = add i64 %shl, %lshr + %ret = add i64 %ret.0, %ashr + ret i64 %ret +} + +define i32 @cshl_or(i32 %a, i32 %b) { +; CHECK-LABEL: cshl_or: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshl_or_b32 v0, v0, v1, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %b, 31 + %shl = shl i32 %a, %and + %or = or i32 %shl, %a + ret i32 %or +} + +define i32 @cshl_add(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: cshl_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshl_add_u32 v0, v0, v1, v2 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %b, 31 + %shl = shl i32 %a, %and + %add = add i32 %shl, %c + ret i32 %add +} + +define i32 @add_cshl(i32 %a, i32 %b) { +; CHECK-LABEL: add_cshl: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_add_lshl_u32 v0, v0, v1, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %add = add i32 %a, %b + %and = and i32 %b, 31 + %shl = shl i32 %add, %and + ret i32 %shl +} diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -639,10 +639,8 @@ ; VI-LABEL: v_fshr_i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_xor_b32_e32 v3, -1, v2 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; VI-NEXT: v_and_b32_e32 v3, 15, v3 -; VI-NEXT: v_and_b32_e32 v2, 15, v2 +; VI-NEXT: v_xor_b32_e32 v3, -1, v2 ; 
VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -651,10 +649,8 @@ ; GFX9-LABEL: v_fshr_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -669,10 +665,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -701,18 +695,14 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; VI-NEXT: v_and_b32_e32 v4, 15, v3 ; VI-NEXT: v_mov_b32_e32 v5, 1 -; VI-NEXT: v_xor_b32_e32 v3, -1, v3 +; VI-NEXT: v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_and_b32_e32 v3, 15, v3 -; VI-NEXT: v_lshrrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_e32 v3, -1, v3 ; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5 ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_xor_b32_e32 v4, -1, v2 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; VI-NEXT: v_and_b32_e32 v4, 15, v4 -; VI-NEXT: v_and_b32_e32 v2, 15, v2 +; VI-NEXT: v_xor_b32_e32 v4, -1, v2 ; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -779,27 +769,21 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; VI-NEXT: v_and_b32_e32 v7, 15, v6 ; VI-NEXT: v_mov_b32_e32 v8, 1 -; VI-NEXT: v_xor_b32_e32 v6, -1, v6 +; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_and_b32_e32 v6, 15, v6 -; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_e32 v6, -1, v6 ; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_xor_b32_e32 v7, -1, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 -; VI-NEXT: v_and_b32_e32 v7, 15, v7 -; VI-NEXT: v_and_b32_e32 v5, 15, v5 +; VI-NEXT: v_xor_b32_e32 v7, -1, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v3 -; VI-NEXT: v_xor_b32_e32 v3, -1, v4 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; VI-NEXT: v_and_b32_e32 v3, 15, v3 +; VI-NEXT: v_xor_b32_e32 v3, -1, v4 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; VI-NEXT: v_and_b32_e32 v3, 15, v4 -; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2 +; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] @@ -808,27 +792,21 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v7, 15, v6 ; GFX9-NEXT: v_mov_b32_e32 v8, 1 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v4 -; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2 +; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 @@ -844,31 +822,25 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX10-NEXT: v_and_b32_e32 v9, 15, v6 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX10-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v6 +; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_lshrrev_b16 v4, v9, v7 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 +; GFX10-NEXT: v_lshrrev_b16 v4, v6, v9 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v6, v10 -; GFX10-NEXT: v_and_b32_e32 v7, 15, v11 +; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX10-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3 +; GFX10-NEXT: v_lshlrev_b16 v1, v2, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) ret <3 x i16> %ret @@ -905,34 +877,26 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; VI-NEXT: v_and_b32_e32 v7, 15, v6 ; 
VI-NEXT: v_mov_b32_e32 v8, 1 -; VI-NEXT: v_xor_b32_e32 v6, -1, v6 +; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_and_b32_e32 v6, 15, v6 -; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_e32 v6, -1, v6 ; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; VI-NEXT: v_and_b32_e32 v9, 15, v7 -; VI-NEXT: v_xor_b32_e32 v7, -1, v7 +; VI-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_and_b32_e32 v7, 15, v7 +; VI-NEXT: v_xor_b32_e32 v7, -1, v7 ; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8 -; VI-NEXT: v_xor_b32_e32 v8, -1, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 -; VI-NEXT: v_and_b32_e32 v8, 15, v8 -; VI-NEXT: v_and_b32_e32 v5, 15, v5 +; VI-NEXT: v_xor_b32_e32 v8, -1, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v3 -; VI-NEXT: v_xor_b32_e32 v3, -1, v4 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; VI-NEXT: v_and_b32_e32 v3, 15, v3 +; VI-NEXT: v_xor_b32_e32 v3, -1, v4 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; VI-NEXT: v_and_b32_e32 v3, 15, v4 -; VI-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2 +; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -943,34 +907,26 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v7, 15, v6 ; GFX9-NEXT: v_mov_b32_e32 v8, 1 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v9, 15, v7 -; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 -; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX9-NEXT: 
v_xor_b32_e32 v3, -1, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 15, v4 -; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2 +; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 @@ -989,40 +945,32 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v6 -; GFX10-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8 -; GFX10-NEXT: v_and_b32_e32 v13, 15, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v9, 15, v9 -; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 +; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7 +; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX10-NEXT: v_lshlrev_b16 v11, 1, v11 -; GFX10-NEXT: v_lshlrev_b16 v7, v9, v8 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 -; GFX10-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX10-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 15, v9 -; GFX10-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v5 +; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8 +; GFX10-NEXT: v_xor_b32_e32 v13, -1, v11 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 +; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 +; GFX10-NEXT: v_lshlrev_b16 v1, v12, v1 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_lshrrev_b16 v4, v13, v12 -; GFX10-NEXT: v_lshlrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v5, v9, v11 +; GFX10-NEXT: v_lshrrev_b16 v4, v11, v10 +; GFX10-NEXT: v_lshlrev_b16 v5, v13, v8 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_or_b32_e32 v3, v7, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v6, v7 ; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 ; GFX10-NEXT: v_and_b32_e32 v1, v2, v1 @@ -1037,11 +985,9 @@ ; SI-LABEL: v_fshr_i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 63, v4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 ; SI-NEXT: v_not_b32_e32 v4, v4 -; SI-NEXT: v_and_b32_e32 v4, 63, v4 -; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v5 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1050,11 +996,9 @@ ; VI-LABEL: v_fshr_i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v5, 
63, v4 ; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; VI-NEXT: v_not_b32_e32 v4, v4 -; VI-NEXT: v_and_b32_e32 v4, 63, v4 -; VI-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] ; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; VI-NEXT: v_or_b32_e32 v1, v1, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1063,11 +1007,9 @@ ; GFX9-LABEL: v_fshr_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_not_b32_e32 v4, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1082,10 +1024,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 +; GFX10-NEXT: v_not_b32_e32 v5, v4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 @@ -1099,18 +1039,14 @@ ; SI-LABEL: v_fshr_v2i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 63, v8 -; SI-NEXT: v_not_b32_e32 v8, v8 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; SI-NEXT: v_and_b32_e32 v8, 63, v8 -; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v9 +; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 +; SI-NEXT: v_not_b32_e32 v8, v8 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_and_b32_e32 v5, 63, v10 -; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v5 +; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v10 ; SI-NEXT: v_not_b32_e32 v7, v10 -; SI-NEXT: v_and_b32_e32 v7, 63, v7 ; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 ; SI-NEXT: v_or_b32_e32 v0, v0, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 @@ -1120,18 +1056,14 @@ ; VI-LABEL: v_fshr_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v9, 63, v8 ; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; VI-NEXT: v_not_b32_e32 v8, v8 -; VI-NEXT: v_and_b32_e32 v8, 63, v8 -; VI-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] ; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] ; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; VI-NEXT: v_or_b32_e32 v1, v1, v5 -; VI-NEXT: v_and_b32_e32 v5, 63, v10 -; VI-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7] ; VI-NEXT: v_not_b32_e32 v7, v10 -; VI-NEXT: v_and_b32_e32 v7, 63, v7 ; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] ; VI-NEXT: v_or_b32_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v3, v3, v6 @@ -1141,18 +1073,14 @@ ; GFX9-LABEL: v_fshr_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX9-NEXT: v_not_b32_e32 v8, v8 -; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX9-NEXT: v_and_b32_e32 v5, 63, v10 -; GFX9-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7] ; GFX9-NEXT: v_not_b32_e32 v7, v10 -; GFX9-NEXT: 
v_and_b32_e32 v7, 63, v7 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v6 @@ -1168,17 +1096,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_not_b32_e32 v9, v8 -; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX10-NEXT: v_and_b32_e32 v9, 63, v9 -; GFX10-NEXT: v_and_b32_e32 v10, 63, v10 -; GFX10-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX10-NEXT: v_not_b32_e32 v9, v8 +; GFX10-NEXT: v_not_b32_e32 v11, v10 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -245,8 +245,7 @@ } ; GCN-LABEL: {{^}}trunc_shl_and31: -; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 31 -; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}} +; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; GCN-NOT: v_lshl_b64 ; GCN-NOT: v_lshlrev_b64 define amdgpu_kernel void @trunc_shl_and31(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {