diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1858,6 +1858,11 @@
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
     assert(Known.getBitWidth() == InBits && "Src width has changed?");
     Known = Known.zext(BitWidth);
+
+    // Attempt to avoid multi-use ops if we don't need anything from them.
+    if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+            Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc));
     break;
   }
   case ISD::SIGN_EXTEND:
@@ -1906,6 +1911,11 @@
       if (!TLO.LegalOperations() || isOperationLegal(Opc, VT))
         return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
     }
+
+    // Attempt to avoid multi-use ops if we don't need anything from them.
+    if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+            Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc));
     break;
   }
   case ISD::ANY_EXTEND:
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -493,11 +493,11 @@
 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v3
 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v4
 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT: v_or_b32_e32 v0, v0, v3
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bswap_v3i16:
@@ -515,27 +515,27 @@
 ; SI-LABEL: v_bswap_v4i16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v3, v3, 8
-; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
+; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8
+; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
 ; SI-NEXT: s_mov_b32 s4, 0xff00ff
 ; SI-NEXT: s_mov_b32 s5, 0xffff0000
-; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8
-; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
-; SI-NEXT: v_alignbit_b32 v6, v1, v1, 8
-; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT: v_alignbit_b32 v7, v0, v0, 8
+; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8
 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
-; SI-NEXT: v_bfi_b32 v3, s4, v3, v4
-; SI-NEXT: v_bfi_b32 v2, s4, v2, v5
-; SI-NEXT: v_bfi_b32 v1, s4, v1, v6
-; SI-NEXT: v_bfi_b32 v0, s4, v0, v7
+; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8
+; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
+; SI-NEXT: v_alignbit_b32 v7, v2, v2, 8
+; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
+; SI-NEXT: v_bfi_b32 v1, s4, v1, v4
+; SI-NEXT: v_bfi_b32 v0, s4, v0, v5
+; SI-NEXT: v_bfi_b32 v3, s4, v3, v6
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v7
+; SI-NEXT: v_and_b32_e32 v4, s5, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT: v_and_b32_e32 v3, s5, v3
 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v1, s5, v1
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -767,21 +767,21 @@
 ; SI-NEXT: v_lshl_b32_e32 v0, v0, v7
 ; SI-NEXT: v_or_b32_e32 v0, v0, v6
 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; SI-NEXT: v_mov_b32_e32 v9, 0xffff
 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; SI-NEXT: v_and_b32_e32 v3, 15, v8
+; SI-NEXT: v_sub_i32_e32 v6, vcc, 16, v3
+; SI-NEXT: v_and_b32_e32 v10, s4, v5
+; SI-NEXT: v_lshr_b32_e32 v4, v10, v3
+; SI-NEXT: v_lshl_b32_e32 v2, v2, v6
+; SI-NEXT: v_mov_b32_e32 v9, 0xffff
+; SI-NEXT: v_or_b32_e32 v2, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; SI-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_and_b32_e32 v0, v9, v0
 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_and_b32_e32 v1, 15, v8
-; SI-NEXT: v_sub_i32_e32 v4, vcc, 16, v1
-; SI-NEXT: v_and_b32_e32 v10, s4, v5
-; SI-NEXT: v_lshr_b32_e32 v3, v10, v1
-; SI-NEXT: v_lshl_b32_e32 v2, v2, v4
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
-; SI-NEXT: v_and_b32_e32 v2, v9, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; SI-NEXT: v_and_b32_e32 v2, v9, v3
+; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v3i16:
@@ -865,46 +865,46 @@
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_mov_b32 s4, 0xffff
-; SI-NEXT: v_and_b32_e32 v11, 15, v11
-; SI-NEXT: v_and_b32_e32 v16, s4, v7
-; SI-NEXT: v_sub_i32_e32 v17, vcc, 16, v11
-; SI-NEXT: v_lshr_b32_e32 v16, v16, v11
-; SI-NEXT: v_lshl_b32_e32 v3, v3, v17
-; SI-NEXT: v_or_b32_e32 v3, v3, v16
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
+; SI-NEXT: v_and_b32_e32 v9, 15, v9
+; SI-NEXT: v_and_b32_e32 v16, s4, v5
+; SI-NEXT: v_sub_i32_e32 v17, vcc, 16, v9
+; SI-NEXT: v_lshr_b32_e32 v16, v16, v9
+; SI-NEXT: v_lshl_b32_e32 v1, v1, v17
+; SI-NEXT: v_or_b32_e32 v1, v1, v16
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; SI-NEXT: v_and_b32_e32 v5, 15, v8
+; SI-NEXT: v_sub_i32_e32 v9, vcc, 16, v5
+; SI-NEXT: v_and_b32_e32 v15, s4, v4
+; SI-NEXT: v_lshr_b32_e32 v8, v15, v5
+; SI-NEXT: v_lshl_b32_e32 v0, v0, v9
+; SI-NEXT: v_or_b32_e32 v0, v0, v8
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; SI-NEXT: v_and_b32_e32 v4, 15, v11
+; SI-NEXT: v_sub_i32_e32 v8, vcc, 16, v4
+; SI-NEXT: v_and_b32_e32 v14, s4, v7
+; SI-NEXT: v_lshr_b32_e32 v5, v14, v4
+; SI-NEXT: v_lshl_b32_e32 v3, v3, v8
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_and_b32_e32 v4, 15, v10
 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; SI-NEXT: v_and_b32_e32 v7, 15, v10
-; SI-NEXT: v_sub_i32_e32 v11, vcc, 16, v7
-; SI-NEXT: v_and_b32_e32 v15, s4, v6
-; SI-NEXT: v_lshr_b32_e32 v10, v15, v7
-; SI-NEXT: v_lshl_b32_e32 v2, v2, v11
-; SI-NEXT: v_or_b32_e32 v2, v2, v10
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v4
+; SI-NEXT: v_and_b32_e32 v13, s4, v6
+; SI-NEXT: v_lshr_b32_e32 v5, v13, v4
+; SI-NEXT: v_lshl_b32_e32 v2, v2, v7
+; SI-NEXT: v_or_b32_e32 v2, v2, v5
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
 ; SI-NEXT: v_mov_b32_e32 v12, 0xffff
 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT: v_and_b32_e32 v2, v12, v2
 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_and_b32_e32 v3, 15, v9
-; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v3
-; SI-NEXT: v_and_b32_e32 v14, s4, v5
-; SI-NEXT: v_lshr_b32_e32 v6, v14, v3
-; SI-NEXT: v_lshl_b32_e32 v1, v1, v7
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v6
-; SI-NEXT: v_and_b32_e32 v3, 15, v8
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; SI-NEXT: v_sub_i32_e32 v6, vcc, 16, v3
-; SI-NEXT: v_and_b32_e32 v13, s4, v4
-; SI-NEXT: v_lshr_b32_e32 v5, v13, v3
-; SI-NEXT: v_lshl_b32_e32 v0, v0, v6
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; SI-NEXT: v_or_b32_e32 v0, v0, v5
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_and_b32_e32 v0, v12, v0
 ; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -1470,21 +1470,21 @@
 ; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40018
 ; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40014
 ; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40010
-; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40000
-; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40004
-; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40008
+; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40008
+; GFX7-NEXT: s_bfe_i32 s19, s5, 0x4000c
+; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40000
 ; GFX7-NEXT: s_ashr_i32 s14, s5, 28
-; GFX7-NEXT: s_bfe_i32 s5, s5, 0x4000c
+; GFX7-NEXT: s_bfe_i32 s5, s5, 0x40004
 ; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018
 ; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014
 ; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010
-; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40000
+; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008
 ; GFX7-NEXT: v_mov_b32_e32 v4, s18
-; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40004
+; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c
 ; GFX7-NEXT: v_mov_b32_e32 v3, s19
-; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40008
+; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40000
 ; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: s_bfe_i32 s4, s4, 0x4000c
+; GFX7-NEXT: s_bfe_i32 s4, s4, 0x40004
 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1
 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2
@@ -1494,17 +1494,17 @@
 ; GFX7-NEXT: v_and_b32_e32 v2, s8, v2
 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT: v_and_b32_e32 v4, s8, v4
-; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3
 ; GFX7-NEXT: v_mov_b32_e32 v5, s17
 ; GFX7-NEXT: v_mov_b32_e32 v6, s16
 ; GFX7-NEXT: v_mov_b32_e32 v7, s15
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v5, v0
 ; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -1988,17 +1988,17 @@
 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40004
-; GFX7-NEXT: s_bfe_u32 s17, s5, 0x40004
-; GFX7-NEXT: s_bfe_u32 s19, s5, 0x4000c
+; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c
+; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c
+; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004
 ; GFX7-NEXT: v_mov_b32_e32 v4, s17
 ; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018
 ; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014
 ; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010
-; GFX7-NEXT: s_and_b32 s18, s5, 15
+; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008
 ; GFX7-NEXT: s_lshr_b32 s13, s5, 28
-; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40008
-; GFX7-NEXT: s_bfe_u32 s12, s4, 0x4000c
+; GFX7-NEXT: s_and_b32 s5, s5, 15
+; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004
 ; GFX7-NEXT: v_mov_b32_e32 v2, s19
 ; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2
 ; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4
@@ -2006,25 +2006,25 @@
 ; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018
 ; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014
 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010
-; GFX7-NEXT: s_and_b32 s11, s4, 15
+; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008
 ; GFX7-NEXT: v_mov_b32_e32 v3, s18
-; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40008
+; GFX7-NEXT: s_and_b32 s4, s4, 15
 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
 ; GFX7-NEXT: v_mul_u32_u24_e32 v1, s4, v1
 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3
 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3
 ; GFX7-NEXT: v_mov_b32_e32 v5, s16
 ; GFX7-NEXT: v_mov_b32_e32 v6, s15
 ; GFX7-NEXT: v_mov_b32_e32 v7, s14
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0
 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -170,24 +170,24 @@
 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff
 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
 ; GFX6-NEXT: s_movk_i32 s5, 0x8000
 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
+; GFX6-NEXT: v_max_i32_e32 v3, s5, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v5
-; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
+; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -170,25 +170,25 @@
 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
 ; GFX6-NEXT: s_movk_i32 s5, 0x8000
 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
+; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
 ; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
 ; GFX6-NEXT: s_mov_b32 s6, 0xffff
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v3, s5, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_and_b32_e32 v0, s6, v0
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v2, v5
-; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT: v_and_b32_e32 v2, s6, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_and_b32_e32 v2, s6, v3
+; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -128,19 +128,19 @@
 ; GFX6-NEXT: v_and_b32_e32 v4, s4, v4
 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GFX6-NEXT: v_and_b32_e32 v5, s4, v5
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3
 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT: v_min_u32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v5, s4, v5
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT: v_min_u32_e32 v3, s4, v2
 ; GFX6-NEXT: v_min_u32_e32 v0, s4, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v5
-; GFX6-NEXT: v_min_u32_e32 v1, s4, v1
-; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
+; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -131,17 +131,17 @@
 ; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
 ; GFX6-NEXT: v_max_u32_e32 v1, v1, v8
 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v7
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT: v_and_b32_e32 v6, s4, v5
 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; GFX6-NEXT: v_max_u32_e32 v2, v2, v6
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v2, v5
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_max_u32_e32 v1, v2, v6
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v3i16:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -1947,18 +1947,18 @@
 ;
 ; AVX512F-LABEL: splatvar_funnnel_v16i8:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm1
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT: vpslld %xmm4, %zmm3, %zmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1
 ; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1967,18 +1967,18 @@
 ;
 ; AVX512VL-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm1
 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VL-NEXT: vpslld %xmm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1
 ; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
 ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1988,18 +1988,18 @@
 ; AVX512BW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpsllw %xmm4, %ymm3, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
@@ -2009,18 +2009,18 @@
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VBMI2: # %bb.0:
 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm3, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
 ; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
@@ -2029,18 +2029,18 @@
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
 ; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
 ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
@@ -2049,18 +2049,18 @@
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
 ; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
 ; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
 ; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
 ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1967,18 +1967,18 @@
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrld %xmm5, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrld %xmm4, %zmm3, %zmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpslld %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0
 ; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm2
+; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -1987,18 +1987,18 @@
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrld %xmm5, %zmm3, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrld %xmm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpslld %xmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0
 ; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm2
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
@@ -2008,17 +2008,17 @@
 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm4, %k1
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT: vzeroupper
@@ -2029,17 +2029,17 @@
 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm4, %k1
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT: vzeroupper
@@ -2049,17 +2049,17 @@
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmb %xmm4, %xmm4, %k1
+; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VLBW-NEXT: vzeroupper
 ; AVX512VLBW-NEXT: retq
@@ -2068,17 +2068,17 @@
 ; AVX512VLVBMI2: # %bb.0:
 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
 ; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm4, %xmm4, %k1
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VLVBMI2-NEXT: vzeroupper
 ; AVX512VLVBMI2-NEXT: retq
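
For context on the TargetLowering.cpp change: SimplifyMultipleUseDemandedBits returns a replacement value that agrees with the extend's source on every demanded bit, without rewriting or deleting the source's other users. The sketch below is a minimal, self-contained scalar analogue of that idea, not LLVM API; the helper name and the driver values are illustrative only. It shows why a multi-use AND can be looked through when its mask already covers the demanded bits, which is exactly the kind of look-through the new ZERO_EXTEND/SIGN_EXTEND calls enable (and what the reshuffled AMDGPU/X86 check lines above reflect).

// Illustrative scalar analogue (assumed names, not SelectionDAG code).
#include <cassert>
#include <cstdint>

// If every demanded bit passes through the AND mask unchanged, Src itself
// agrees with (Src & Mask) on the demanded bits, so this one use may read
// Src directly; the multi-use AND stays intact for its other users.
static uint32_t lookThroughAnd(uint32_t Src, uint32_t Mask,
                               uint32_t Demanded) {
  if ((Demanded & ~Mask) == 0)
    return Src;        // bypass the AND for this use only
  return Src & Mask;   // otherwise keep the original computation
}

int main() {
  const uint32_t Src = 0xDEADBEEF;
  const uint32_t Mask = 0x0000FFFF;     // the AND keeps the low 16 bits
  const uint32_t Demanded = 0x000000FF; // this use reads only the low byte
  // Both forms agree on every demanded bit, so the shortcut is sound.
  assert((lookThroughAnd(Src, Mask, Demanded) & Demanded) ==
         ((Src & Mask) & Demanded));
  return 0;
}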