diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3121,14 +3121,16 @@
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
-  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
-
-  // fold (sub x, c) -> (add x, -c)
-  if (N1C) {
-    return DAG.getNode(ISD::ADD, DL, VT, N0,
-                       DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
+  // fold (sub x, c) -> (add x, (sub 0, c))
+  if (isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
+    SDValue NegC = DAG.FoldConstantArithmetic(
+        ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT).getNode(), N1.getNode());
+    assert(NegC && "Constant-folding failed!");
+    return DAG.getNode(ISD::ADD, DL, VT, N0, NegC);
   }
 
+  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
+
   if (isNullOrNullSplat(N0)) {
     unsigned BitWidth = VT.getScalarSizeInBits();
     // Right-shifting everything out but the sign bit followed by negation is
diff --git a/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll b/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
--- a/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
+++ b/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
@@ -286,9 +286,9 @@
 ; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
 ; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: movi v1.4s, #8
+; CHECK-NEXT: mvni v1.4s, #7
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: bl vec_use
 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
@@ -348,8 +348,8 @@
 define <4 x i32> @vec_sub_const_sub_const(<4 x i32> %arg) {
 ; CHECK-LABEL: vec_sub_const_sub_const:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.4s, #10
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mvni v1.4s, #9
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
   %t0 = sub <4 x i32> %arg,
   %t1 = sub <4 x i32> %t0,
@@ -363,14 +363,14 @@
 ; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
 ; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: movi v1.4s, #8
+; CHECK-NEXT: mvni v1.4s, #7
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: bl vec_use
 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: movi v0.4s, #10
-; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: mvni v0.4s, #9
+; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: add sp, sp, #32 // =32
 ; CHECK-NEXT: ret
   %t0 = sub <4 x i32> %arg,
@@ -384,7 +384,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI24_0
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0]
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
   %t0 = sub <4 x i32> %arg,
   %t1 = sub <4 x i32> %t0,
@@ -442,13 +442,13 @@
 ; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
 ; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: movi v1.4s, #8
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mvni v1.4s, #7
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: bl vec_use
 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: movi v0.4s, #2
+; CHECK-NEXT: movi 
v0.4s, #10 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: add sp, sp, #32 // =32 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll --- a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll +++ b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll @@ -189,7 +189,7 @@ ; CHECK-NEXT: adrp x8, .LCPI14_0 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = add <4 x i32> %t0, %b @@ -201,7 +201,7 @@ ; CHECK-NEXT: adrp x8, .LCPI15_0 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = add <4 x i32> %b, %t0 @@ -257,7 +257,7 @@ ; CHECK-NEXT: adrp x8, .LCPI19_0 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_0] ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = add <4 x i32> %a, ; constant always on RHS %r = sub <4 x i32> %b, %t0 @@ -273,7 +273,7 @@ ; CHECK-NEXT: adrp x8, .LCPI20_0 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = sub <4 x i32> %t0, %b @@ -313,7 +313,7 @@ ; CHECK-NEXT: adrp x8, .LCPI23_0 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> , %a %r = sub <4 x i32> %b, %t0 diff --git a/llvm/test/CodeGen/AArch64/vec_cttz.ll b/llvm/test/CodeGen/AArch64/vec_cttz.ll --- a/llvm/test/CodeGen/AArch64/vec_cttz.ll +++ b/llvm/test/CodeGen/AArch64/vec_cttz.ll @@ -40,8 +40,8 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %a) nounwind { ; CHECK-LABEL: cttz_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2s, #1 -; CHECK-NEXT: sub v1.2s, v0.2s, v1.2s +; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-NEXT: add v1.2s, v0.2s, v1.2s ; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b ; CHECK-NEXT: clz v0.2s, v0.2s ; CHECK-NEXT: movi v1.2s, #32 @@ -54,9 +54,8 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %a) nounwind { ; CHECK-LABEL: cttz_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #1 -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: sub d1, d0, d1 +; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-NEXT: add d1, d0, d1 ; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b ; CHECK-NEXT: cnt v0.8b, v0.8b ; CHECK-NEXT: uaddlp v0.4h, v0.8b @@ -96,8 +95,8 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %a) nounwind { ; CHECK-LABEL: cttz_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: sub v1.4s, v0.4s, v1.4s +; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-NEXT: add v1.4s, v0.4s, v1.4s ; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: clz v0.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #32 @@ -110,9 +109,8 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %a) nounwind { ; CHECK-LABEL: cttz_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #1 -; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: sub v1.2d, v0.2d, v1.2d +; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-NEXT: add v1.2d, v0.2d, v1.2d ; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: cnt v0.16b, v0.16b ; CHECK-NEXT: uaddlp v0.8h, v0.16b diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll 
b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -785,7 +785,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_u16 v2, v3, 64 op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -843,7 +843,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x400007 +; GFX9-NEXT: s_mov_b32 s4, 0xffc0fff9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 @@ -853,7 +853,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4 +; GFX9-NEXT: v_pk_add_u16 v2, v3, s4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -911,7 +911,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x7b0040 +; GFX9-NEXT: s_mov_b32 s4, 0xff85ffc0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 @@ -921,7 +921,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4 +; GFX9-NEXT: v_pk_add_u16 v2, v3, s4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -987,7 +987,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 7 +; GFX9-NEXT: v_pk_sub_u16 v2, v3, 7 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1050,7 +1050,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1112,7 +1112,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_u16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1174,7 +1174,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_u16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -168,7 +168,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 0x1c8007b +; GFX9-NEXT: s_mov_b32 s8, 0xfe38ff85 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -179,7 +179,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 +; GFX9-NEXT: v_pk_add_u16 v0, v0, s8 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -218,7 +218,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 0xfc21fcb3 +; GFX9-NEXT: s_mov_b32 s8, 0x3df034d ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -229,7 +229,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 +; GFX9-NEXT: v_pk_add_u16 v0, v0, s8 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -277,7 +277,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -325,7 +325,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32 +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -363,7 +363,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 1.0 +; GFX9-NEXT: s_mov_b32 s8, -4.0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -374,7 +374,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s0 ; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 +; GFX9-NEXT: v_pk_add_u16 v0, v0, s8 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/Mips/msa/arithmetic.ll b/llvm/test/CodeGen/Mips/msa/arithmetic.ll --- a/llvm/test/CodeGen/Mips/msa/arithmetic.ll +++ b/llvm/test/CodeGen/Mips/msa/arithmetic.ll @@ -180,7 +180,8 @@ ; ALL-LABEL: sub_v16i8_i: ; ALL: # %bb.0: ; ALL-NEXT: ld.b $w0, 0($5) -; ALL-NEXT: subvi.b $w0, $w0, 1 +; ALL-NEXT: ldi.b $w1, -1 +; ALL-NEXT: addv.b $w0, $w0, $w1 ; ALL-NEXT: jr $ra ; ALL-NEXT: st.b $w0, 0($4) %1 = load <16 x i8>, <16 x i8>* %a @@ -193,8 +194,9 @@ define void @sub_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind { ; ALL-LABEL: sub_v8i16_i: ; ALL: # %bb.0: -; ALL-NEXT: ld.h $w0, 0($5) -; ALL-NEXT: subvi.h $w0, $w0, 1 +; ALL-NEXT: ldi.b $w0, -1 +; ALL-NEXT: ld.h $w1, 0($5) +; ALL-NEXT: addv.h $w0, $w1, $w0 ; ALL-NEXT: jr $ra ; ALL-NEXT: st.h $w0, 0($4) %1 = load <8 x i16>, <8 x i16>* %a @@ -207,8 +209,9 @@ define void @sub_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind { ; ALL-LABEL: sub_v4i32_i: ; ALL: # %bb.0: -; ALL-NEXT: ld.w $w0, 0($5) -; ALL-NEXT: subvi.w $w0, $w0, 1 +; ALL-NEXT: ldi.b $w0, -1 +; ALL-NEXT: ld.w $w1, 0($5) +; ALL-NEXT: addv.w $w0, $w1, $w0 ; ALL-NEXT: jr $ra ; ALL-NEXT: st.w $w0, 0($4) %1 = load <4 x i32>, <4 x i32>* %a @@ -218,12 +221,22 @@ } define void @sub_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind { -; ALL-LABEL: sub_v2i64_i: -; ALL: # %bb.0: -; ALL-NEXT: ld.d $w0, 0($5) -; ALL-NEXT: 
subvi.d $w0, $w0, 1 -; ALL-NEXT: jr $ra -; ALL-NEXT: st.d $w0, 0($4) +; MIPS-LABEL: sub_v2i64_i: +; MIPS: # %bb.0: +; MIPS-NEXT: ldi.b $w0, -1 +; MIPS-NEXT: shf.w $w0, $w0, 177 +; MIPS-NEXT: ld.d $w1, 0($5) +; MIPS-NEXT: addv.d $w0, $w1, $w0 +; MIPS-NEXT: jr $ra +; MIPS-NEXT: st.d $w0, 0($4) +; +; MIPSEL-LABEL: sub_v2i64_i: +; MIPSEL: # %bb.0: +; MIPSEL-NEXT: ldi.b $w0, -1 +; MIPSEL-NEXT: ld.d $w1, 0($5) +; MIPSEL-NEXT: addv.d $w0, $w1, $w0 +; MIPSEL-NEXT: jr $ra +; MIPSEL-NEXT: st.d $w0, 0($4) %1 = load <2 x i64>, <2 x i64>* %a %2 = sub <2 x i64> %1, store <2 x i64> %2, <2 x i64>* %c diff --git a/llvm/test/CodeGen/Mips/msa/i5-s.ll b/llvm/test/CodeGen/Mips/msa/i5-s.ll --- a/llvm/test/CodeGen/Mips/msa/i5-s.ll +++ b/llvm/test/CodeGen/Mips/msa/i5-s.ll @@ -59,12 +59,13 @@ define void @llvm_mips_subvi_w_test() nounwind { ; ALL-LABEL: llvm_mips_subvi_w_test: ; ALL: # %bb.0: # %entry +; ALL-NEXT: lui $1, %hi(llvm_mips_subvi_w_ARG1) +; ALL-NEXT: addiu $1, $1, %lo(llvm_mips_subvi_w_ARG1) +; ALL-NEXT: ld.w $w0, 0($1) +; ALL-NEXT: ldi.w $w1, -14 +; ALL-NEXT: addv.w $w0, $w0, $w1 ; ALL-NEXT: lui $1, %hi(llvm_mips_subvi_w_RES) ; ALL-NEXT: addiu $1, $1, %lo(llvm_mips_subvi_w_RES) -; ALL-NEXT: lui $2, %hi(llvm_mips_subvi_w_ARG1) -; ALL-NEXT: addiu $2, $2, %lo(llvm_mips_subvi_w_ARG1) -; ALL-NEXT: ld.w $w0, 0($2) -; ALL-NEXT: subvi.w $w0, $w0, 14 ; ALL-NEXT: jr $ra ; ALL-NEXT: st.w $w0, 0($1) entry: diff --git a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll --- a/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll +++ b/llvm/test/CodeGen/PowerPC/vec_add_sub_doubleword.ll @@ -80,8 +80,10 @@ define <2 x i64> @decrement_by_one(<2 x i64> %x) nounwind { ; VSX-LABEL: decrement_by_one: ; VSX: # %bb.0: -; VSX-NEXT: xxleqv 35, 35, 35 -; VSX-NEXT: vsubudm 2, 2, 3 +; VSX-NEXT: addis 3, 2, .LCPI4_0@toc@ha +; VSX-NEXT: addi 3, 3, .LCPI4_0@toc@l +; VSX-NEXT: lxvd2x 35, 0, 3 +; VSX-NEXT: vaddudm 2, 2, 3 ; VSX-NEXT: blr ; ; NOVSX-LABEL: decrement_by_one: @@ -89,7 +91,7 @@ ; NOVSX-NEXT: addis 3, 2, .LCPI4_0@toc@ha ; NOVSX-NEXT: addi 3, 3, .LCPI4_0@toc@l ; NOVSX-NEXT: lvx 3, 0, 3 -; NOVSX-NEXT: vsubudm 2, 2, 3 +; NOVSX-NEXT: vaddudm 2, 2, 3 ; NOVSX-NEXT: blr %result = sub <2 x i64> %x, ret <2 x i64> %result diff --git a/llvm/test/CodeGen/PowerPC/vec_add_sub_quadword.ll b/llvm/test/CodeGen/PowerPC/vec_add_sub_quadword.ll --- a/llvm/test/CodeGen/PowerPC/vec_add_sub_quadword.ll +++ b/llvm/test/CodeGen/PowerPC/vec_add_sub_quadword.ll @@ -76,10 +76,8 @@ define <1 x i128> @decrement_by_one(<1 x i128> %x) nounwind { ; VSX-LABEL: decrement_by_one: ; VSX: # %bb.0: -; VSX-NEXT: addis 3, 2, .LCPI5_0@toc@ha -; VSX-NEXT: addi 3, 3, .LCPI5_0@toc@l -; VSX-NEXT: lxvd2x 35, 0, 3 -; VSX-NEXT: vsubuqm 2, 2, 3 +; VSX-NEXT: xxleqv 35, 35, 35 +; VSX-NEXT: vadduqm 2, 2, 3 ; VSX-NEXT: blr ; ; NOVSX-LABEL: decrement_by_one: @@ -87,7 +85,7 @@ ; NOVSX-NEXT: addis 3, 2, .LCPI5_0@toc@ha ; NOVSX-NEXT: addi 3, 3, .LCPI5_0@toc@l ; NOVSX-NEXT: lvx 3, 0, 3 -; NOVSX-NEXT: vsubuqm 2, 2, 3 +; NOVSX-NEXT: vadduqm 2, 2, 3 ; NOVSX-NEXT: blr %result = sub <1 x i128> %x, ret <1 x i128> %result diff --git a/llvm/test/CodeGen/PowerPC/vec_clz.ll b/llvm/test/CodeGen/PowerPC/vec_clz.ll --- a/llvm/test/CodeGen/PowerPC/vec_clz.ll +++ b/llvm/test/CodeGen/PowerPC/vec_clz.ll @@ -85,16 +85,16 @@ define <2 x i32> @illegal_cttz(<2 x i32> %v1) { ; CHECK-LABEL: illegal_cttz: ; CHECK: # %bb.0: -; CHECK-NEXT: vspltisw 3, 1 -; CHECK-NEXT: vsubuwm 3, 2, 3 +; CHECK-NEXT: xxleqv 35, 35, 35 +; CHECK-NEXT: vadduwm 3, 2, 3 ; 
CHECK-NEXT: xxlandc 34, 35, 34 ; CHECK-NEXT: vpopcntw 2, 2 ; CHECK-NEXT: blr ; ; CHECK-NOVSX-LABEL: illegal_cttz: ; CHECK-NOVSX: # %bb.0: -; CHECK-NOVSX-NEXT: vspltisw 3, 1 -; CHECK-NOVSX-NEXT: vsubuwm 3, 2, 3 +; CHECK-NOVSX-NEXT: vspltisb 3, -1 +; CHECK-NOVSX-NEXT: vadduwm 3, 2, 3 ; CHECK-NOVSX-NEXT: vandc 2, 3, 2 ; CHECK-NOVSX-NEXT: vpopcntw 2, 2 ; CHECK-NOVSX-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/vec_splat.ll b/llvm/test/CodeGen/PowerPC/vec_splat.ll --- a/llvm/test/CodeGen/PowerPC/vec_splat.ll +++ b/llvm/test/CodeGen/PowerPC/vec_splat.ll @@ -247,8 +247,10 @@ ; G5-LABEL: spltish: ; G5: # %bb.0: ; G5-NEXT: lvx 2, 0, 4 -; G5-NEXT: vspltish 3, 15 -; G5-NEXT: vsububm 2, 2, 3 +; G5-NEXT: li 4, .LCPI5_0@l +; G5-NEXT: lis 5, .LCPI5_0@ha +; G5-NEXT: lvx 3, 5, 4 +; G5-NEXT: vaddubm 2, 2, 3 ; G5-NEXT: stvx 2, 0, 3 ; G5-NEXT: blr %tmp = load <16 x i8>, <16 x i8>* %B ; <<16 x i8>> [#uses=1] diff --git a/llvm/test/CodeGen/X86/addsub-constant-folding.ll b/llvm/test/CodeGen/X86/addsub-constant-folding.ll --- a/llvm/test/CodeGen/X86/addsub-constant-folding.ll +++ b/llvm/test/CodeGen/X86/addsub-constant-folding.ll @@ -457,8 +457,10 @@ ; X86: # %bb.0: ; X86-NEXT: subl $28, %esp ; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill -; X86-NEXT: psubd {{\.LCPI.*}}, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm0 = [4294967288,4294967288,4294967288,4294967288] +; X86-NEXT: paddd %xmm1, %xmm0 ; X86-NEXT: calll vec_use ; X86-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload ; X86-NEXT: paddd {{\.LCPI.*}}, %xmm0 @@ -470,8 +472,10 @@ ; X64: # %bb.0: ; X64-NEXT: subq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm0 = [4294967288,4294967288,4294967288,4294967288] +; X64-NEXT: paddd %xmm1, %xmm0 ; X64-NEXT: callq vec_use ; X64-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; X64-NEXT: paddd {{.*}}(%rip), %xmm0 @@ -558,12 +562,12 @@ define <4 x i32> @vec_sub_const_sub_const(<4 x i32> %arg) { ; X86-LABEL: vec_sub_const_sub_const: ; X86: # %bb.0: -; X86-NEXT: psubd {{\.LCPI.*}}, %xmm0 +; X86-NEXT: paddd {{\.LCPI.*}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_sub_const_sub_const: ; X64: # %bb.0: -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> %arg, %t1 = sub <4 x i32> %t0, @@ -575,11 +579,13 @@ ; X86: # %bb.0: ; X86-NEXT: subl $28, %esp ; X86-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill -; X86-NEXT: psubd {{\.LCPI.*}}, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm0 = [4294967288,4294967288,4294967288,4294967288] +; X86-NEXT: paddd %xmm1, %xmm0 ; X86-NEXT: calll vec_use ; X86-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload -; X86-NEXT: psubd {{\.LCPI.*}}, %xmm0 +; X86-NEXT: paddd {{\.LCPI.*}}, %xmm0 ; X86-NEXT: addl $28, %esp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -588,11 +594,13 @@ ; X64: # %bb.0: ; X64-NEXT: subq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm0 = [4294967288,4294967288,4294967288,4294967288] +; X64-NEXT: paddd %xmm1, %xmm0 ; X64-NEXT: callq vec_use ; X64-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: addq $24, %rsp ; X64-NEXT: 
.cfi_def_cfa_offset 8 ; X64-NEXT: retq @@ -605,12 +613,12 @@ define <4 x i32> @vec_sub_const_sub_const_nonsplat(<4 x i32> %arg) { ; X86-LABEL: vec_sub_const_sub_const_nonsplat: ; X86: # %bb.0: -; X86-NEXT: psubd {{\.LCPI.*}}, %xmm0 +; X86-NEXT: paddd {{\.LCPI.*}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_sub_const_sub_const_nonsplat: ; X64: # %bb.0: -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> %arg, %t1 = sub <4 x i32> %t0, @@ -698,10 +706,12 @@ ; X86: # %bb.0: ; X86-NEXT: subl $28, %esp ; X86-NEXT: .cfi_def_cfa_offset 32 -; X86-NEXT: psubd {{\.LCPI.*}}, %xmm0 +; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill +; X86-NEXT: movdqa {{.*#+}} xmm0 = [4294967288,4294967288,4294967288,4294967288] +; X86-NEXT: paddd %xmm1, %xmm0 ; X86-NEXT: calll vec_use -; X86-NEXT: movdqa {{.*#+}} xmm0 = [2,2,2,2] +; X86-NEXT: movdqa {{.*#+}} xmm0 = [10,10,10,10] ; X86-NEXT: movdqu (%esp), %xmm1 # 16-byte Reload ; X86-NEXT: psubd %xmm1, %xmm0 ; X86-NEXT: addl $28, %esp @@ -712,10 +722,12 @@ ; X64: # %bb.0: ; X64-NEXT: subq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 32 -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; X64-NEXT: movdqa {{.*#+}} xmm0 = [4294967288,4294967288,4294967288,4294967288] +; X64-NEXT: paddd %xmm1, %xmm0 ; X64-NEXT: callq vec_use -; X64-NEXT: movdqa {{.*#+}} xmm0 = [2,2,2,2] +; X64-NEXT: movdqa {{.*#+}} xmm0 = [10,10,10,10] ; X64-NEXT: psubd (%rsp), %xmm0 # 16-byte Folded Reload ; X64-NEXT: addq $24, %rsp ; X64-NEXT: .cfi_def_cfa_offset 8 diff --git a/llvm/test/CodeGen/X86/avx-shift.ll b/llvm/test/CodeGen/X86/avx-shift.ll --- a/llvm/test/CodeGen/X86/avx-shift.ll +++ b/llvm/test/CodeGen/X86/avx-shift.ll @@ -109,11 +109,12 @@ ; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; CHECK-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; CHECK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; CHECK-NEXT: vpaddb %xmm4, %xmm1, %xmm1 ; CHECK-NEXT: vpsrlw $2, %xmm0, %xmm0 ; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; CHECK-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %s = ashr <32 x i8> %a, diff --git a/llvm/test/CodeGen/X86/avx2-shift.ll b/llvm/test/CodeGen/X86/avx2-shift.ll --- a/llvm/test/CodeGen/X86/avx2-shift.ll +++ b/llvm/test/CodeGen/X86/avx2-shift.ll @@ -473,18 +473,16 @@ ; X32: # %bb.0: ; X32-NEXT: vpsrlw $3, %ymm0, %ymm0 ; X32-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 -; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; X32-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; X32-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; X32-NEXT: vpxor {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vpaddb {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: sra_v32i8: ; X64: # %bb.0: ; X64-NEXT: vpsrlw $3, %ymm0, %ymm0 ; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; X64-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; X64-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq %B = ashr <32 
x i8> %A, ret <32 x i8> %B diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -1630,9 +1630,8 @@ ; SSE2-NEXT: psrlq $3, %xmm1 ; SSE2-NEXT: psrlq $4, %xmm2 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] -; SSE2-NEXT: xorpd %xmm1, %xmm2 -; SSE2-NEXT: psubq %xmm1, %xmm2 +; SSE2-NEXT: xorpd {{.*}}(%rip), %xmm2 +; SSE2-NEXT: paddq {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; @@ -1659,9 +1658,8 @@ ; SSE41-NEXT: psrlq $4, %xmm1 ; SSE41-NEXT: psrlq $3, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] -; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: psubq %xmm1, %xmm2 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2 +; SSE41-NEXT: paddq {{.*}}(%rip), %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm1 ; SSE41-NEXT: retq ; @@ -1677,9 +1675,8 @@ ; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3 ; AVX1-NEXT: vpsrlq $3, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488] -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 @@ -1697,9 +1694,8 @@ ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX2-NEXT: retq ; @@ -1781,21 +1777,22 @@ ; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] ; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] ; SSE2-NEXT: xorpd %xmm1, %xmm4 -; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [17293822569102704640,17870283321406128128] +; SSE2-NEXT: paddq %xmm6, %xmm4 ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: psrlq $61, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: psrlq $61, %xmm7 ; SSE2-NEXT: psrlq $60, %xmm5 -; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] +; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] ; SSE2-NEXT: paddq %xmm3, %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: psrlq $3, %xmm3 ; SSE2-NEXT: psrlq $4, %xmm5 ; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] ; SSE2-NEXT: xorpd %xmm1, %xmm5 -; SSE2-NEXT: psubq %xmm1, %xmm5 +; SSE2-NEXT: paddq %xmm6, %xmm5 ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: retq @@ -1836,21 +1833,22 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488] ; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: psubq %xmm5, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [17293822569102704640,17870283321406128128] +; SSE41-NEXT: paddq %xmm6, %xmm1 ; SSE41-NEXT: movdqa %xmm3, %xmm4 ; SSE41-NEXT: psrad $31, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = 
xmm4[1,1,3,3] -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: psrlq $60, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: psrlq $60, %xmm7 ; SSE41-NEXT: psrlq $61, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7] ; SSE41-NEXT: paddq %xmm3, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: psrlq $4, %xmm3 ; SSE41-NEXT: psrlq $3, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: psubq %xmm5, %xmm4 +; SSE41-NEXT: paddq %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: retq ; @@ -1868,26 +1866,27 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5 -; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5 -; AVX1-NEXT: vpsrad $2, %xmm5, %xmm6 -; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [17293822569102704640,17870283321406128128] +; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm6 +; AVX1-NEXT: vpsrlq $62, %xmm6, %xmm6 +; AVX1-NEXT: vpaddq %xmm6, %xmm0, %xmm6 +; AVX1-NEXT: vpsrad $2, %xmm6, %xmm7 +; AVX1-NEXT: vpsrlq $2, %xmm6, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm5 -; AVX1-NEXT: vpsrlq $60, %xmm5, %xmm6 -; AVX1-NEXT: vpsrlq $61, %xmm5, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm6 +; AVX1-NEXT: vpsrlq $60, %xmm6, %xmm7 +; AVX1-NEXT: vpsrlq $61, %xmm6, %xmm6 +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4,5,6,7] +; AVX1-NEXT: vpaddq %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm6 ; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7] ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2 ; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2 @@ -1909,14 +1908,15 @@ ; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpaddq %ymm7, %ymm2, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll --- 
a/llvm/test/CodeGen/X86/combine-sub.ll +++ b/llvm/test/CodeGen/X86/combine-sub.ll @@ -34,12 +34,12 @@ define <4 x i32> @combine_vec_sub_constant(<4 x i32> %x) { ; SSE-LABEL: combine_vec_sub_constant: ; SSE: # %bb.0: -; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 +; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_sub_constant: ; AVX: # %bb.0: -; AVX-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = sub <4 x i32> %x, ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -271,9 +271,8 @@ ; X64-NEXT: vpsrlq $60, %xmm0, %xmm2 ; X64-NEXT: vpsrlq $61, %xmm0, %xmm0 ; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [4,8] -; X64-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vpmovsxdq %xmm1, %xmm1 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm2 ; X64-NEXT: vpor %xmm1, %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll --- a/llvm/test/CodeGen/X86/packss.ll +++ b/llvm/test/CodeGen/X86/packss.ll @@ -213,10 +213,11 @@ ; X64-SSE-NEXT: psrlq $63, %xmm0 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,9223372036854775808] ; X64-SSE-NEXT: pxor %xmm2, %xmm0 -; X64-SSE-NEXT: psubq %xmm2, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551615,9223372036854775808] +; X64-SSE-NEXT: paddq %xmm3, %xmm0 ; X64-SSE-NEXT: psrlq $63, %xmm1 ; X64-SSE-NEXT: pxor %xmm2, %xmm1 -; X64-SSE-NEXT: psubq %xmm2, %xmm1 +; X64-SSE-NEXT: paddq %xmm3, %xmm1 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-SSE-NEXT: packssdw %xmm1, %xmm0 @@ -230,11 +231,12 @@ ; X64-AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9223372036854775808] ; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; X64-AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551615,9223372036854775808] +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; X64-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -245,11 +247,9 @@ ; X64-AVX2-LABEL: trunc_ashr_v4i64_demandedelts: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9223372036854775808,1,9223372036854775808] -; X64-AVX2-NEXT: # ymm1 = mem[0,1,0,1] -; X64-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll --- 
a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll @@ -10,7 +10,7 @@ ; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX256-NEXT: vplzcntd %ymm0, %ymm0 ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 -; AVX256-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX256-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq ; @@ -19,7 +19,7 @@ ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VL-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -28,7 +28,7 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512F-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 false) @@ -54,7 +54,7 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 false) @@ -68,12 +68,12 @@ ; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX256-NEXT: vplzcntd %ymm1, %ymm1 ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 -; AVX256-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] -; AVX256-NEXT: vpsubw %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vmovdqa {{.*#+}} xmm2 = [65520,65520,65520,65520,65520,65520,65520,65520] +; AVX256-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX256-NEXT: vplzcntd %ymm0, %ymm0 ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 -; AVX256-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX256-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX256-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX256-NEXT: retq ; @@ -82,7 +82,7 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 false) ret <16 x i16> %out @@ -108,12 +108,12 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232] +; AVX512-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 false) diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -490,9 +490,8 @@ ; SSE-NEXT: paddsb %xmm1, %xmm0 ; SSE-NEXT: psrlw $4, %xmm0 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: v16i4: @@ -505,9 +504,8 @@ ; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z diff --git a/llvm/test/CodeGen/X86/sink-addsub-of-const.ll b/llvm/test/CodeGen/X86/sink-addsub-of-const.ll --- a/llvm/test/CodeGen/X86/sink-addsub-of-const.ll +++ b/llvm/test/CodeGen/X86/sink-addsub-of-const.ll @@ -297,13 +297,13 @@ ; X32-LABEL: vec_sink_sub_of_const_to_add0: ; X32: # %bb.0: ; X32-NEXT: paddd %xmm1, %xmm0 -; X32-NEXT: psubd {{\.LCPI.*}}, %xmm0 +; X32-NEXT: paddd {{\.LCPI.*}}, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: vec_sink_sub_of_const_to_add0: ; X64: # %bb.0: ; X64-NEXT: paddd %xmm1, %xmm0 -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> %a, %r = add <4 x i32> %t0, %b @@ -313,13 +313,13 @@ ; X32-LABEL: vec_sink_sub_of_const_to_add1: ; X32: # %bb.0: ; X32-NEXT: paddd %xmm1, %xmm0 -; X32-NEXT: psubd {{\.LCPI.*}}, %xmm0 +; X32-NEXT: paddd {{\.LCPI.*}}, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: vec_sink_sub_of_const_to_add1: ; X64: # %bb.0: ; X64-NEXT: paddd %xmm1, %xmm0 -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> %a, %r = 
add <4 x i32> %b, %t0 @@ -389,14 +389,14 @@ ; X32-LABEL: vec_sink_add_of_const_to_sub2: ; X32: # %bb.0: ; X32-NEXT: psubd %xmm0, %xmm1 -; X32-NEXT: psubd {{\.LCPI.*}}, %xmm1 +; X32-NEXT: paddd {{\.LCPI.*}}, %xmm1 ; X32-NEXT: movdqa %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: vec_sink_add_of_const_to_sub2: ; X64: # %bb.0: ; X64-NEXT: psubd %xmm0, %xmm1 -; X64-NEXT: psubd {{.*}}(%rip), %xmm1 +; X64-NEXT: paddd {{.*}}(%rip), %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq %t0 = add <4 x i32> %a, ; constant always on RHS @@ -411,13 +411,13 @@ ; X32-LABEL: vec_sink_sub_of_const_to_sub: ; X32: # %bb.0: ; X32-NEXT: psubd %xmm1, %xmm0 -; X32-NEXT: psubd {{\.LCPI.*}}, %xmm0 +; X32-NEXT: paddd {{\.LCPI.*}}, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: vec_sink_sub_of_const_to_sub: ; X64: # %bb.0: ; X64-NEXT: psubd %xmm1, %xmm0 -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> %a, %r = sub <4 x i32> %t0, %b @@ -461,13 +461,13 @@ ; X32-LABEL: vec_sink_sub_from_const_to_sub2: ; X32: # %bb.0: ; X32-NEXT: paddd %xmm1, %xmm0 -; X32-NEXT: psubd {{\.LCPI.*}}, %xmm0 +; X32-NEXT: paddd {{\.LCPI.*}}, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: vec_sink_sub_from_const_to_sub2: ; X64: # %bb.0: ; X64-NEXT: paddd %xmm1, %xmm0 -; X64-NEXT: psubd {{.*}}(%rip), %xmm0 +; X64-NEXT: paddd {{.*}}(%rip), %xmm0 ; X64-NEXT: retq %t0 = sub <4 x i32> , %a %r = sub <4 x i32> %b, %t0 diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -490,9 +490,8 @@ ; SSE-NEXT: psubsb %xmm1, %xmm0 ; SSE-NEXT: psrlw $4, %xmm0 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: v16i4: @@ -505,9 +504,8 @@ ; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -186,7 +186,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm3 -; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm4 +; XOPAVX1-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm4 ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -431,7 +431,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm3 -; XOPAVX1-NEXT: vpsubd {{.*}}(%rip), %xmm2, %xmm4 +; XOPAVX1-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm4 ; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -756,7 +756,7 @@ ; XOP: # %bb.0: ; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm3 -; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm2, %xmm4 +; XOP-NEXT: vpor 
{{.*}}(%rip), %xmm2, %xmm4 ; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -1099,7 +1099,7 @@ ; XOP: # %bb.0: ; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm3 -; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm4 +; XOP-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm4 ; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpor %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -2077,7 +2077,7 @@ ; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm4 -; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm5 +; XOPAVX1-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm5 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 ; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2 @@ -2089,7 +2089,7 @@ ; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm3 -; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm2, %xmm4 +; XOPAVX2-NEXT: vpor {{.*}}(%rip), %xmm2, %xmm4 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -153,11 +153,11 @@ ; XOPAVX1-NEXT: vpshlq %xmm4, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm5 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64] -; XOPAVX1-NEXT: vpsubq %xmm5, %xmm4, %xmm6 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551552,18446744073709551552] +; XOPAVX1-NEXT: vpor %xmm5, %xmm4, %xmm6 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; XOPAVX1-NEXT: vpshlq %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm5 +; XOPAVX1-NEXT: vpor %xmm5, %xmm2, %xmm5 ; XOPAVX1-NEXT: vpshlq %xmm5, %xmm1, %xmm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 @@ -334,11 +334,11 @@ ; XOPAVX1-NEXT: vpshld %xmm4, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm5 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] -; XOPAVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm6 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4294967264,4294967264,4294967264,4294967264] +; XOPAVX1-NEXT: vpor %xmm5, %xmm4, %xmm6 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; XOPAVX1-NEXT: vpshld %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm5 +; XOPAVX1-NEXT: vpor %xmm5, %xmm2, %xmm5 ; XOPAVX1-NEXT: vpshld %xmm5, %xmm1, %xmm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 @@ -558,11 +558,11 @@ ; XOPAVX1-NEXT: vpshlw %xmm4, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm5 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; XOPAVX1-NEXT: vpsubw %xmm5, %xmm4, %xmm6 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [65520,65520,65520,65520,65520,65520,65520,65520] +; XOPAVX1-NEXT: vpor %xmm5, %xmm4, %xmm6 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; XOPAVX1-NEXT: vpshlw %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubw %xmm5, %xmm2, %xmm5 +; XOPAVX1-NEXT: vpor %xmm5, %xmm2, %xmm5 ; XOPAVX1-NEXT: vpshlw %xmm5, %xmm1, %xmm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 @@ -857,11 
+857,11 @@ ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm5 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; XOPAVX1-NEXT: vpsubb %xmm5, %xmm4, %xmm6 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; XOPAVX1-NEXT: vpor %xmm5, %xmm4, %xmm6 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; XOPAVX1-NEXT: vpshlb %xmm6, %xmm7, %xmm6 -; XOPAVX1-NEXT: vpsubb %xmm5, %xmm2, %xmm5 +; XOPAVX1-NEXT: vpor %xmm5, %xmm2, %xmm5 ; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm3, %ymm1 @@ -1646,11 +1646,11 @@ ; XOPAVX1-NEXT: vpshlb %xmm4, %xmm5, %xmm5 ; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm6 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; XOPAVX1-NEXT: vpsubb %xmm6, %xmm4, %xmm7 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; XOPAVX1-NEXT: vpor %xmm6, %xmm4, %xmm7 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm7, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm6 +; XOPAVX1-NEXT: vpor %xmm6, %xmm2, %xmm6 ; XOPAVX1-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; XOPAVX1-NEXT: vorps %ymm1, %ymm5, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -182,14 +182,13 @@ ; SSE2-NEXT: packuswb %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: psrlw $7, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psrlw $7, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: psubb %xmm2, %xmm1 +; SSE2-NEXT: paddb {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -206,14 +205,13 @@ ; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: paddb %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $2, %xmm0 +; SSE41-NEXT: psrlw $7, %xmm0 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psrlw $7, %xmm1 +; SSE41-NEXT: psrlw $2, %xmm1 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1 ; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm2, %xmm1 +; SSE41-NEXT: paddb {{.*}}(%rip), %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -229,14 +227,13 @@ ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsubb %xmm2, 
%xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_div7_16i8: @@ -247,14 +244,13 @@ ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm1 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; @@ -265,14 +261,13 @@ ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1 +; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %res = sdiv <16 x i8> %a, @@ -641,31 +636,30 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE2-LABEL: test_rem7_16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427] -; SSE2-NEXT: pmullw %xmm3, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427] +; SSE2-NEXT: pmullw %xmm2, %xmm1 ; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; SSE2-NEXT: pxor %xmm3, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: psrlw $8, %xmm3 +; SSE2-NEXT: packuswb %xmm1, 
%xmm3 +; SSE2-NEXT: paddb %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: psrlw $7, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: paddb %xmm2, %xmm1 -; SSE2-NEXT: psubb %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psllw $3, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: psubb %xmm2, %xmm1 -; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm3 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm3 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm3 +; SSE2-NEXT: paddb %xmm1, %xmm3 +; SSE2-NEXT: paddb {{.*}}(%rip), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psllw $3, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm3 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_rem7_16i8: @@ -681,14 +675,13 @@ ; SSE41-NEXT: packuswb %xmm3, %xmm1 ; SSE41-NEXT: paddb %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $2, %xmm2 +; SSE41-NEXT: psrlw $7, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; SSE41-NEXT: pxor %xmm3, %xmm2 -; SSE41-NEXT: psrlw $7, %xmm1 +; SSE41-NEXT: psrlw $2, %xmm1 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1 ; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: psubb %xmm3, %xmm1 +; SSE41-NEXT: paddb {{.*}}(%rip), %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: psllw $3, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 @@ -708,14 +701,13 @@ ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm2 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 @@ -730,14 +722,13 @@ ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2 +; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm2 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpxor {{.*}}(%rip), %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpaddb {{.*}}(%rip), %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 @@ -752,14 +743,13 @@ ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2 +; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm2 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; 
AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -170,31 +170,32 @@ ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; AVX1-NEXT: vpaddb %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 -; AVX1-NEXT: vpmullw %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 +; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -211,14 +212,13 @@ ; AVX2NOBW-NEXT: vpackuswb %ymm1, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm1 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm0 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: 
test_div7_32i8: @@ -228,14 +228,13 @@ ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1 +; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm1 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vpsubb %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BW-NEXT: retq %res = sdiv <32 x i8> %a, ret <32 x i8> %res @@ -584,15 +583,16 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm8, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX1-NEXT: vpand %xmm9, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX1-NEXT: vpxor %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 @@ -607,12 +607,12 @@ ; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm9, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -631,14 +631,13 @@ ; AVX2NOBW-NEXT: vpackuswb %ymm1, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm2 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX2NOBW-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX2NOBW-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddb %ymm2, 
%ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsllw $3, %ymm1, %ymm2 ; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 @@ -652,14 +651,13 @@ ; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2 +; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm2 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1 +; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1 +; AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 ; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -149,14 +149,15 @@ ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512F-NEXT: vpxor %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; AVX512F-NEXT: vpaddb %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm7 -; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm3 +; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm8 +; AVX512F-NEXT: vpmullw %ymm3, %ymm8, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] @@ -166,8 +167,8 @@ ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; @@ -185,14 +186,13 @@ ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxorq %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, 
%zmm0 +; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %res = sdiv <64 x i8> %a, ret <64 x i8> %res @@ -495,11 +495,12 @@ ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; AVX512F-NEXT: vpaddb %ymm8, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] -; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512F-NEXT: vpand %ymm9, %ymm4, %ymm4 ; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 @@ -517,10 +518,10 @@ ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm8, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3 -; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3 ; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -540,14 +541,13 @@ ; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxorq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpsubb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -1133,7 +1133,7 @@ ; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLCD-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; 
AVX512VLCD-NEXT: vzeroupper ; AVX512VLCD-NEXT: retq ; @@ -1142,7 +1142,7 @@ ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512CD-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; @@ -1326,7 +1326,7 @@ ; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLCD-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VLCD-NEXT: vzeroupper ; AVX512VLCD-NEXT: retq ; @@ -1335,7 +1335,7 @@ ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512CD-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; @@ -1491,7 +1491,7 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1641,7 +1641,7 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -770,7 +770,7 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; X32-AVX-LABEL: testv16i16: @@ -887,7 +887,7 @@ ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; X32-AVX-LABEL: testv16i16u: @@ -980,12 +980,12 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232] +; AVX512-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; @@ -1074,12 +1074,12 @@ ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232] +; AVX512-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll --- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -335,12 +335,12 @@ ; AVX512CD-NEXT: 
vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520] +; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; @@ -350,12 +350,12 @@ ; AVX512CDBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CDBW-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CDBW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512CDBW-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX512CDBW-NEXT: vmovdqa {{.*#+}} ymm2 = [65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520] +; AVX512CDBW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX512CDBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512CDBW-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512CDBW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512CDBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: retq ; @@ -421,12 +421,12 @@ ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520] +; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; @@ -436,12 +436,12 @@ ; 
AVX512CDBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CDBW-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CDBW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512CDBW-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX512CDBW-NEXT: vmovdqa {{.*#+}} ymm2 = [65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520,65520] +; AVX512CDBW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX512CDBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512CDBW-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512CDBW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512CDBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: retq ; @@ -508,22 +508,22 @@ ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232] +; AVX512CD-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512CD-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX512CD-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq @@ -535,22 +535,22 @@ ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CDBW-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CDBW-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512CDBW-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX512CDBW-NEXT: vmovdqa {{.*#+}} xmm3 = [232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232] +; AVX512CDBW-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512CDBW-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX512CDBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512CDBW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CDBW-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CDBW-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX512CDBW-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512CDBW-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX512CDBW-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX512CDBW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512CDBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: retq @@ -602,22 +602,22 @@ ; AVX512CD-NEXT: 
vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232] +; AVX512CD-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512CD-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX512CD-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq @@ -629,22 +629,22 @@ ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CDBW-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CDBW-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512CDBW-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX512CDBW-NEXT: vmovdqa {{.*#+}} xmm3 = [232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232] +; AVX512CDBW-NEXT: vpaddb %xmm3, 
%xmm2, %xmm2 ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512CDBW-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX512CDBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512CDBW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CDBW-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CDBW-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX512CDBW-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512CDBW-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX512CDBW-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX512CDBW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512CDBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -935,9 +935,8 @@ ; SSE2-NEXT: psrlq $1, %xmm1 ; SSE2-NEXT: psrlq $7, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] -; SSE2-NEXT: xorpd %xmm1, %xmm0 -; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: xorpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: paddq {{.*}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v2i64: @@ -946,9 +945,8 @@ ; SSE41-NEXT: psrlq $7, %xmm1 ; SSE41-NEXT: psrlq $1, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: psubq %xmm1, %xmm0 +; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE41-NEXT: paddq {{.*}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i64: @@ -956,17 +954,15 @@ ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
constant_shift_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v2i64: @@ -1412,18 +1408,16 @@ ; SSE: # %bb.0: ; SSE-NEXT: psrlw $3, %xmm0 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v16i8: ; AVX: # %bb.0: ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v16i8: @@ -1435,27 +1429,24 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v16i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v16i8: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: psrlw $3, %xmm0 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: pxor %xmm1, %xmm0 -; X32-SSE-NEXT: psubb %xmm1, %xmm0 +; X32-SSE-NEXT: pxor {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <16 x i8> %a, ret <16 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -1012,28 +1012,25 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX1-LABEL: constant_shift_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2 -; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967296,2] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrlq $62, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $31, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4611686018427387904,72057594037927936] -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: constant_shift_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2] -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: constant_shift_v4i64: @@ -1047,9 +1044,8 @@ ; XOPAVX2-LABEL: constant_shift_v4i64: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2] -; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: constant_shift_v4i64: @@ -1585,11 +1581,12 @@ ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1597,9 +1594,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_shift_v32i8: @@ -1615,27 +1611,24 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; XOPAVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: splatconstant_shift_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 ; 
AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v32i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; X32-AVX1-LABEL: splatconstant_shift_v32i8: @@ -1646,11 +1639,12 @@ ; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; X32-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; X32-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X32-AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 ; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 ; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; X32-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; @@ -1658,9 +1652,8 @@ ; X32-AVX2: # %bb.0: ; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 ; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 -; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; X32-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; X32-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpxor {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddb {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, ret <32 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -364,11 +364,12 @@ ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512DQ-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512DQ-NEXT: vpaddb %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpxor %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb %ymm4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; @@ -376,9 +377,8 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll 
b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -2323,18 +2323,16 @@ ; SSE: # %bb.0: ; SSE-NEXT: psrlw $3, %xmm0 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v8i8: ; AVX: # %bb.0: ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v8i8: @@ -2346,27 +2344,24 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_shift_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; X32-SSE-LABEL: splatconstant_shift_v8i8: ; X32-SSE: # %bb.0: ; X32-SSE-NEXT: psrlw $3, %xmm0 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: pxor %xmm1, %xmm0 -; X32-SSE-NEXT: psubb %xmm1, %xmm0 +; X32-SSE-NEXT: pxor {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: retl %shift = ashr <8 x i8> %a, ret <8 x i8> %shift @@ -2377,18 +2372,16 @@ ; SSE: # %bb.0: ; SSE-NEXT: psrlw $3, %xmm0 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 +; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: splatconstant_shift_v4i8: ; AVX: # %bb.0: ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v4i8: @@ -2400,27 +2393,24 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq ; ; AVX512VL-LABEL: 
splatconstant_shift_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: pxor %xmm1, %xmm0
-; X32-SSE-NEXT: psubb %xmm1, %xmm0
+; X32-SSE-NEXT: pxor {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i8> %a,
ret <4 x i8> %shift
@@ -2431,18 +2421,16 @@
; SSE: # %bb.0:
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
@@ -2454,27 +2442,24 @@
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: pxor %xmm1, %xmm0
-; X32-SSE-NEXT: psubb %xmm1, %xmm0
+; X32-SSE-NEXT: pxor {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i8> %a,
ret <2 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -1328,14 +1328,14 @@
; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1343,7 +1343,7 @@
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -1351,7 +1351,7 @@
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
@@ -1359,7 +1359,7 @@
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <4 x i64> %a0,
@@ -1381,7 +1381,7 @@
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
@@ -1394,7 +1394,7 @@
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1407,7 +1407,7 @@
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -1419,14 +1419,14 @@
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i64> %a0,
@@ -1442,7 +1442,7 @@
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
@@ -1452,7 +1452,7 @@
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1460,7 +1460,7 @@
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1468,7 +1468,7 @@
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i32> %a0,
@@ -1495,7 +1495,7 @@
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
@@ -1516,7 +1516,7 @@
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1541,7 +1541,7 @@
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
@@ -1563,7 +1563,7 @@
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
@@ -1573,7 +1573,7 @@
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i64> %a0,
@@ -1592,7 +1592,7 @@
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
@@ -1605,7 +1605,7 @@
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1620,14 +1620,14 @@
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i32> %a0,
@@ -1642,7 +1642,7 @@
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
@@ -1650,7 +1650,7 @@
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1659,7 +1659,7 @@
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1667,7 +1667,7 @@
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -1675,7 +1675,7 @@
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -1683,7 +1683,7 @@
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i16> %a0,
@@ -1694,12 +1694,12 @@
define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%a = zext <16 x i8> %x to <16 x i16>
%b = sub <16 x i16> %a,
diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -442,36 +442,32 @@
; X32-SSE2ONLY: # %bb.0:
; X32-SSE2ONLY-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE2ONLY-NEXT: psrlw $1, %xmm0
-; X32-SSE2ONLY-NEXT: movdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; X32-SSE2ONLY-NEXT: pxor %xmm1, %xmm0
-; X32-SSE2ONLY-NEXT: psubb %xmm1, %xmm0
+; X32-SSE2ONLY-NEXT: pxor {{\.LCPI.*}}, %xmm0
+; X32-SSE2ONLY-NEXT: paddb {{\.LCPI.*}}, %xmm0
; X32-SSE2ONLY-NEXT: retl
;
; X32-SSE2AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1:
; X32-SSE2AVX: # %bb.0:
; X32-SSE2AVX-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X32-SSE2AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X32-SSE2AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; X32-SSE2AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X32-SSE2AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X32-SSE2AVX-NEXT: vpxor {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-SSE2AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm0, %xmm0
; X32-SSE2AVX-NEXT: retl
;
; X64-SSE2ONLY-LABEL: test_128_i8_x_16_224_mask_ashr_1:
; X64-SSE2ONLY: # %bb.0:
; X64-SSE2ONLY-NEXT: pand {{.*}}(%rip), %xmm0
; X64-SSE2ONLY-NEXT: psrlw $1, %xmm0
-; X64-SSE2ONLY-NEXT: movdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; X64-SSE2ONLY-NEXT: pxor %xmm1, %xmm0
-; X64-SSE2ONLY-NEXT: psubb %xmm1, %xmm0
+; X64-SSE2ONLY-NEXT: pxor {{.*}}(%rip), %xmm0
+; X64-SSE2ONLY-NEXT: paddb {{.*}}(%rip), %xmm0
; X64-SSE2ONLY-NEXT: retq
;
; X64-SSE2AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1:
; X64-SSE2AVX: # %bb.0:
; X64-SSE2AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-SSE2AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-SSE2AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; X64-SSE2AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-SSE2AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-SSE2AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; X64-SSE2AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; X64-SSE2AVX-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
@@ -482,36 +478,32 @@
; X32-SSE2ONLY: # %bb.0:
; X32-SSE2ONLY-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE2ONLY-NEXT: psrlw $4, %xmm0
-; X32-SSE2ONLY-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X32-SSE2ONLY-NEXT: pxor %xmm1, %xmm0
-; X32-SSE2ONLY-NEXT: psubb %xmm1, %xmm0
+; X32-SSE2ONLY-NEXT: pxor {{\.LCPI.*}}, %xmm0
+; X32-SSE2ONLY-NEXT: paddb {{\.LCPI.*}}, %xmm0
; X32-SSE2ONLY-NEXT: retl
;
; X32-SSE2AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4:
; X32-SSE2AVX: # %bb.0:
; X32-SSE2AVX-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X32-SSE2AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X32-SSE2AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X32-SSE2AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X32-SSE2AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X32-SSE2AVX-NEXT: vpxor {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-SSE2AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm0, %xmm0
; X32-SSE2AVX-NEXT: retl
;
; X64-SSE2ONLY-LABEL: test_128_i8_x_16_224_mask_ashr_4:
; X64-SSE2ONLY: # %bb.0:
; X64-SSE2ONLY-NEXT: pand {{.*}}(%rip), %xmm0
; X64-SSE2ONLY-NEXT: psrlw $4, %xmm0
-; X64-SSE2ONLY-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X64-SSE2ONLY-NEXT: pxor %xmm1, %xmm0
-; X64-SSE2ONLY-NEXT: psubb %xmm1, %xmm0
+; X64-SSE2ONLY-NEXT: pxor {{.*}}(%rip), %xmm0
+; X64-SSE2ONLY-NEXT: paddb {{.*}}(%rip), %xmm0
; X64-SSE2ONLY-NEXT: retq
;
; X64-SSE2AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4:
; X64-SSE2AVX: # %bb.0:
; X64-SSE2AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-SSE2AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; X64-SSE2AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; X64-SSE2AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-SSE2AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-SSE2AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; X64-SSE2AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; X64-SSE2AVX-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
@@ -522,36 +514,32 @@
; X32-SSE2ONLY: # %bb.0:
; X32-SSE2ONLY-NEXT: psrlw $5, %xmm0
; X32-SSE2ONLY-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE2ONLY-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; X32-SSE2ONLY-NEXT: pxor %xmm1, %xmm0
-; X32-SSE2ONLY-NEXT: psubb %xmm1, %xmm0
+; X32-SSE2ONLY-NEXT: pxor {{\.LCPI.*}}, %xmm0
+; X32-SSE2ONLY-NEXT: paddb {{\.LCPI.*}}, %xmm0
; X32-SSE2ONLY-NEXT: retl
;
; X32-SSE2AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5:
; X32-SSE2AVX: # %bb.0:
; X32-SSE2AVX-NEXT: vpsrlw $5, %xmm0, %xmm0
; X32-SSE2AVX-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-SSE2AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; X32-SSE2AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X32-SSE2AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X32-SSE2AVX-NEXT: vpxor {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-SSE2AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm0, %xmm0
; X32-SSE2AVX-NEXT: retl
;
; X64-SSE2ONLY-LABEL: test_128_i8_x_16_224_mask_ashr_5:
; X64-SSE2ONLY: # %bb.0:
; X64-SSE2ONLY-NEXT: psrlw $5, %xmm0
; X64-SSE2ONLY-NEXT: pand {{.*}}(%rip), %xmm0
-; X64-SSE2ONLY-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; X64-SSE2ONLY-NEXT: pxor %xmm1, %xmm0
-; X64-SSE2ONLY-NEXT: psubb %xmm1, %xmm0
+; X64-SSE2ONLY-NEXT: pxor {{.*}}(%rip), %xmm0
+; X64-SSE2ONLY-NEXT: paddb {{.*}}(%rip), %xmm0
; X64-SSE2ONLY-NEXT: retq
;
; X64-SSE2AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5:
; X64-SSE2AVX: # %bb.0:
; X64-SSE2AVX-NEXT: vpsrlw $5, %xmm0, %xmm0
; X64-SSE2AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-SSE2AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; X64-SSE2AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-SSE2AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-SSE2AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; X64-SSE2AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; X64-SSE2AVX-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
@@ -562,36 +550,32 @@
; X32-SSE2ONLY: # %bb.0:
; X32-SSE2ONLY-NEXT: psrlw $6, %xmm0
; X32-SSE2ONLY-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE2ONLY-NEXT: movdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; X32-SSE2ONLY-NEXT: pxor %xmm1, %xmm0
-; X32-SSE2ONLY-NEXT: psubb %xmm1, %xmm0
+; X32-SSE2ONLY-NEXT: pxor {{\.LCPI.*}}, %xmm0
+; X32-SSE2ONLY-NEXT: paddb {{\.LCPI.*}}, %xmm0
; X32-SSE2ONLY-NEXT: retl
;
; X32-SSE2AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6:
; X32-SSE2AVX: # %bb.0:
; X32-SSE2AVX-NEXT: vpsrlw $6, %xmm0, %xmm0
; X32-SSE2AVX-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-SSE2AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; X32-SSE2AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X32-SSE2AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X32-SSE2AVX-NEXT: vpxor {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-SSE2AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm0, %xmm0
; X32-SSE2AVX-NEXT: retl
;
; X64-SSE2ONLY-LABEL: test_128_i8_x_16_224_mask_ashr_6:
; X64-SSE2ONLY: # %bb.0:
; X64-SSE2ONLY-NEXT: psrlw $6, %xmm0
; X64-SSE2ONLY-NEXT: pand {{.*}}(%rip), %xmm0
-; X64-SSE2ONLY-NEXT: movdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; X64-SSE2ONLY-NEXT: pxor %xmm1, %xmm0
-; X64-SSE2ONLY-NEXT: psubb %xmm1, %xmm0
+; X64-SSE2ONLY-NEXT: pxor {{.*}}(%rip), %xmm0
+; X64-SSE2ONLY-NEXT: paddb {{.*}}(%rip), %xmm0
; X64-SSE2ONLY-NEXT: retq
;
; X64-SSE2AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6:
; X64-SSE2AVX: # %bb.0:
; X64-SSE2AVX-NEXT: vpsrlw $6, %xmm0, %xmm0
; X64-SSE2AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-SSE2AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; X64-SSE2AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; X64-SSE2AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-SSE2AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; X64-SSE2AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; X64-SSE2AVX-NEXT: retq
%t0 = and <16 x i8> %a0,
%t1 = ashr <16 x i8> %t0,
diff --git a/llvm/test/CodeGen/X86/widen_arith-4.ll b/llvm/test/CodeGen/X86/widen_arith-4.ll
--- a/llvm/test/CodeGen/X86/widen_arith-4.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-4.ll
@@ -14,8 +14,8 @@
; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movw $0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movl $0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u>
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = <2,4,2,2,2,u,u,u>
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <64994,64452,64994,64994,64994,u,u,u>
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB0_1: # %forcond
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -29,8 +29,8 @@
; SSE2-NEXT: shlq $4, %rax
; SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT: movdqa (%rdx,%rax), %xmm2
-; SSE2-NEXT: psubw %xmm0, %xmm2
-; SSE2-NEXT: pmullw %xmm1, %xmm2
+; SSE2-NEXT: pmullw %xmm0, %xmm2
+; SSE2-NEXT: paddw %xmm1, %xmm2
; SSE2-NEXT: movq %xmm2, (%rcx,%rax)
; SSE2-NEXT: pextrw $4, %xmm2, %edx
; SSE2-NEXT: movw %dx, 8(%rcx,%rax)
@@ -48,7 +48,7 @@
; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movw $0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: movl $0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <64994,64452,64994,64994,64994,u,u,u>
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: .LBB0_1: # %forcond
; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
@@ -62,13 +62,13 @@
; SSE41-NEXT: shlq $4, %rax
; SSE41-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT: movdqa (%rdx,%rax), %xmm1
-; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $2, %xmm2
; SSE41-NEXT: psllw $1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: pextrw $4, %xmm1, 8(%rcx,%rax)
-; SSE41-NEXT: movq %xmm2, (%rcx,%rax)
+; SSE41-NEXT: movq %xmm1, (%rcx,%rax)
; SSE41-NEXT: incl -{{[0-9]+}}(%rsp)
; SSE41-NEXT: jmp .LBB0_1
; SSE41-NEXT: .LBB0_3: # %afterfor
diff --git a/llvm/test/CodeGen/X86/widen_arith-5.ll b/llvm/test/CodeGen/X86/widen_arith-5.ll
--- a/llvm/test/CodeGen/X86/widen_arith-5.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-5.ll
@@ -13,7 +13,7 @@
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <3,3,3,u>
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4294967293,4294967293,4294967293,u>
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %forcond
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
@@ -28,7 +28,7 @@
; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; CHECK-NEXT: movdqa (%rdx,%rax), %xmm1
; CHECK-NEXT: pslld $2, %xmm1
-; CHECK-NEXT: psubd %xmm0, %xmm1
+; CHECK-NEXT: paddd %xmm0, %xmm1
; CHECK-NEXT: pextrd $2, %xmm1, 8(%rcx,%rax)
; CHECK-NEXT: movq %xmm1, (%rcx,%rax)
; CHECK-NEXT: incl -{{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/widen_cast-4.ll b/llvm/test/CodeGen/X86/widen_cast-4.ll
--- a/llvm/test/CodeGen/X86/widen_cast-4.ll
+++ b/llvm/test/CodeGen/X86/widen_cast-4.ll
@@ -11,6 +11,7 @@
; WIDE-NEXT: pcmpeqd %xmm0, %xmm0
; WIDE-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; WIDE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; WIDE-NEXT: movdqa {{.*#+}} xmm3 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224]
; WIDE-NEXT: .p2align 4, 0x90
; WIDE-NEXT: .LBB0_1: # %forcond
; WIDE-NEXT: # =>This Inner Loop Header: Depth=1
@@ -26,13 +27,13 @@
; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp)
; WIDE-NEXT: addl {{[0-9]+}}(%esp), %ecx
; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; WIDE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
-; WIDE-NEXT: psubb %xmm0, %xmm3
-; WIDE-NEXT: psrlw $2, %xmm3
-; WIDE-NEXT: pand %xmm1, %xmm3
-; WIDE-NEXT: pxor %xmm2, %xmm3
-; WIDE-NEXT: psubb %xmm2, %xmm3
-; WIDE-NEXT: movq %xmm3, (%edx,%eax,8)
+; WIDE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
+; WIDE-NEXT: psubb %xmm0, %xmm4
+; WIDE-NEXT: psrlw $2, %xmm4
+; WIDE-NEXT: pand %xmm1, %xmm4
+; WIDE-NEXT: pxor %xmm2, %xmm4
+; WIDE-NEXT: paddb %xmm3, %xmm4
+; WIDE-NEXT: movq %xmm4, (%edx,%eax,8)
; WIDE-NEXT: incl (%esp)
; WIDE-NEXT: jmp .LBB0_1
; WIDE-NEXT: .LBB0_3: # %afterfor
diff --git a/llvm/test/CodeGen/X86/x86-shifts.ll b/llvm/test/CodeGen/X86/x86-shifts.ll
--- a/llvm/test/CodeGen/X86/x86-shifts.ll
+++ b/llvm/test/CodeGen/X86/x86-shifts.ll
@@ -351,18 +351,16 @@
; X32: # %bb.0:
; X32-NEXT: psrlw $3, %xmm0
; X32-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; X32-NEXT: pxor %xmm1, %xmm0
-; X32-NEXT: psubb %xmm1, %xmm0
+; X32-NEXT: pxor {{\.LCPI.*}}, %xmm0
+; X32-NEXT: paddb {{\.LCPI.*}}, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: sra_v16i8:
; X64: # %bb.0:
; X64-NEXT: psrlw $3, %xmm0
; X64-NEXT: pand {{.*}}(%rip), %xmm0
-; X64-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: psubb %xmm1, %xmm0
+; X64-NEXT: pxor {{.*}}(%rip), %xmm0
+; X64-NEXT: paddb {{.*}}(%rip), %xmm0
; X64-NEXT: retq
%B = ashr <16 x i8> %A,
ret <16 x i8> %B