Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5833,9 +5833,46 @@
   return SDValue();
 }
 
+/// Materialize one SplatBitSize-wide repetition of SplatValue as a constant
+/// vector with VT's scalar type.
+static Constant *getConstantVector(MVT VT, APInt SplatValue,
+                                   unsigned SplatBitSize, LLVMContext &C) {
+  unsigned ScalarSize = VT.getScalarSizeInBits();
+  unsigned NumElm = SplatBitSize / ScalarSize;
+
+  SmallVector<Constant *, 32> ConstantVec;
+  for (unsigned i = 0; i < NumElm; i++) {
+    // Extract the i-th scalar-sized chunk of the repeated pattern.
+    APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
+    Constant *Const;
+    if (VT.isFloatingPoint()) {
+      assert((ScalarSize == 32 || ScalarSize == 64) &&
+             "Unsupported floating point scalar size");
+      if (ScalarSize == 32)
+        Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
+      else
+        Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
+    } else
+      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
+    ConstantVec.push_back(Const);
+  }
+  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
+}
+
+/// Check whether N is used, possibly through a chain of bitcasts, by a
+/// target-specific shuffle node.
+static bool isUseOfShuffleThroughBitcast(SDNode *N) {
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    if (isTargetShuffle((*I)->getOpcode()))
+      return true;
+    if ((*I)->getOpcode() == ISD::BITCAST &&
+        isUseOfShuffleThroughBitcast(*I))
+      return true;
+  }
+  return false;
+}
+
 /// Attempt to use the vbroadcast instruction to generate a splat value for the
 /// following cases:
-/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
+/// 1. A splat BUILD_VECTOR which uses:
+///    a. A single scalar load, or a constant.
+///    b. A repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
 ///    a scalar load, or a constant.
 /// The VBROADCAST node is returned when a pattern is found,
@@ -5869,8 +5906,58 @@
 
   // We need a splat of a single value to use broadcast, and it doesn't
   // make any sense if the value is only in one element of the vector.
-  if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
+  if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
+    APInt SplatValue, Undef;
+    unsigned SplatBitSize;
+    bool HasUndef;
+    // Check if this is a repeated constant pattern suitable for broadcasting.
+    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
+        SplatBitSize > VT.getScalarSizeInBits() &&
+        SplatBitSize < VT.getSizeInBits()) {
+      // Avoid replacing with a broadcast when the build vector is a use of a
+      // shuffle instruction, to preserve the present custom lowering of
+      // shuffles.
+      if (isUseOfShuffleThroughBitcast(BVOp) || BVOp->hasOneUse())
+        return SDValue();
+      // Replace BUILD_VECTOR with a broadcast of the repeated constants.
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
+      if (Subtarget.hasAVX2()) {
+        if (SplatBitSize <= 32 ||
+            (Subtarget.is64Bit() && SplatBitSize <= 64)) {
+          // The splatted value can fit in one constant in the constant pool.
+          // Load the constant and broadcast it.
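+          // For example, a v8i16 BUILD_VECTOR of <1,2,3,4,1,2,3,4> has
+          // SplatBitSize == 64: the pattern is materialized as the single
+          // i64 constant 0x0004000300020001, loaded from the constant pool
+          // once and broadcast to both 64-bit lanes (e.g. VPBROADCASTQ).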
+          MVT CVT = MVT::getIntegerVT(SplatBitSize);
+          Type *ScalarTy = Type::getIntNTy(*DAG.getContext(), SplatBitSize);
+          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
+          SDValue CP = DAG.getConstantPool(C, PVT);
+          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+          Ld = DAG.getLoad(
+              CVT, dl, DAG.getEntryNode(), CP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+                                       MVT::getVectorVT(CVT, Repeat), Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        } else if (SplatBitSize != 64) { // No support for 64-bit vector loads.
+          // Load the vector of repeated constants and broadcast it as a
+          // subvector.
+          MVT CVT = VT.getScalarType();
+          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
+                                             *DAG.getContext());
+          SDValue VCP = DAG.getConstantPool(VecC, PVT);
+          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
+          Ld = DAG.getLoad(
+              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        }
+      }
+    }
     return SDValue();
+  }
 
   Ld = Splat;
   ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
Index: test/CodeGen/X86/avg.ll
===================================================================
--- test/CodeGen/X86/avg.ll
+++ test/CodeGen/X86/avg.ll
@@ -2132,7 +2132,7 @@
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpaddd %zmm4, %zmm3, %zmm3
 ; AVX512F-NEXT:    vpaddd %zmm4, %zmm2, %zmm2
 ; AVX512F-NEXT:    vpaddd %zmm4, %zmm1, %zmm1
@@ -2405,7 +2405,7 @@
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0
Index: test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
===================================================================
--- test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -0,0 +1,688 @@
+; NOTE: Assertions have been simplified MANUALLY after running utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=AVX512BW -check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX2 -check-prefix=AVX2-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX512 -check-prefix=AVX512F-64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=AVX512BW -check-prefix=AVX512 -check-prefix=AVX512BW-64
+
+;===-----------------------------------------------------------------------------===
+; This test checks the ability to recognize a cross element pattern of
+; constants and perform the load via broadcasting a smaller constant
+; vector.
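+; Depending on the width of the repeated pattern, the broadcast is expected
+; to use vpbroadcastw/vpbroadcastd/vpbroadcastq for 16/32/64-bit patterns,
+; vbroadcasti128 for 128-bit patterns, and vbroadcasti32x4/vbroadcasti64x4
+; (or their vbroadcastf* forms for floats) for the 512-bit cases.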
+; For example:
+; <i32 0, i32 1, i32 0, i32 1> => broadcast of the constant vector <i32 0, i32 1>
+;===-----------------------------------------------------------------------------===
+
+define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
+; ALL32-LABEL: f16xi8_i16:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastw {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f16xi8_i16:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastw {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+  %res1 = add <16 x i8> , %a
+  %res2 = and <16 x i8> , %res1
+  ret <16 x i8> %res2
+}
+
+define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
+; ALL32-LABEL: f16xi8_i32:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f16xi8_i32:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+  %res1 = add <16 x i8> , %a
+  %res2 = and <16 x i8> , %res1
+  ret <16 x i8> %res2
+}
+
+define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
+; ALL32-LABEL: f16xi8_i64:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f16xi8_i64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+  %res1 = add <16 x i8> , %a
+  %res2 = and <16 x i8> , %res1
+  ret <16 x i8> %res2
+}
+
+define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
+; ALL32-LABEL: f32xi8_i16:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastw {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f32xi8_i16:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastw {{.*}}(%rip), %ymm1
+; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = add <32 x i8> , %a
+  %res2 = and <32 x i8> , %res1
+  ret <32 x i8> %res2
+}
+
+define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
+; ALL32-LABEL: f32xi8_i32:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f32xi8_i32:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = add <32 x i8> , %a
+  %res2 = and <32 x i8> , %res1
+  ret <32 x i8> %res2
+}
+
+define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
+; ALL32-LABEL: f32xi8_i64:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f32xi8_i64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
+; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = add <32 x i8> , %a
+  %res2 = and <32 x i8> , %res1
+  ret <32 x i8> %res2
+}
+
+define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
+; ALL32-LABEL: f32xi8_i128:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f32xi8_i128:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = add <32 x i8> , %a
+  %res2 = and <32 x i8> , %res1
+  ret <32 x i8> %res2
+}
+
+define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64xi8_i16:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vpbroadcastw {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64xi8_i16:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vpbroadcastw {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <64 x i8> , %a
+  %res2 = and <64 x i8> , %res1
+  ret <64 x i8> %res2
+}
+
+define <64 x i8> @f64i8_i32(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64i8_i32:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64i8_i32:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <64 x i8> , %a
+  %res2 = and <64 x i8> , %res1
+  ret <64 x i8> %res2
+}
+
+define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
+; AVX2-64-LABEL: f64xi8_i64:
+; AVX2-64:       # BB#0:
+; AVX2-64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX2-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    retq
+;
+; AVX512F-64-LABEL: f64xi8_i64:
+; AVX512F-64:       # BB#0:
+; AVX512F-64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX512F-64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-64-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512F-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-64-NEXT:    retq
+;
+; AVX512BW-64-LABEL: f64xi8_i64:
+; AVX512BW-64:       # BB#0:
+; AVX512BW-64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1
+; AVX512BW-64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    retq
+  %res1 = add <64 x i8> , %a
+  %res2 = and <64 x i8> , %res1
+  ret <64 x i8> %res2
+}
+
+define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
+; NO-AVX512BW-LABEL: f64xi8_i128:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f64xi8_i128:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <64 x i8> , %a
+  %res2 = and <64 x i8> , %res1
+  ret <64 x i8> %res2
+}
+
+define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
+; AVX512BW-LABEL: f64xi8_i256:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <64 x i8> , %a
+  %res2 = and <64 x i8> , %res1
+  ret <64 x i8> %res2
+}
+
+define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
+; ALL32-LABEL: f8xi16_i32:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f8xi16_i32:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+  %res1 = add <8 x i16> , %a
+  %res2 = and <8 x i16> , %res1
+  ret <8 x i16> %res2
+}
+
+define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
+; ALL32-LABEL: f8xi16_i64:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
+; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f8xi16_i64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+  %res1 = add <8 x i16> , %a
+  %res2 = and <8 x i16> , %res1
+  ret <8 x i16> %res2
+}
+
+define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
+; ALL32-LABEL: f16xi16_i32:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm1
+; ALL32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f16xi16_i32:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; ALL64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = add <16 x i16> , %a
+  %res2 = and <16 x i16> , %res1
+  ret <16 x i16> %res2
+}
+
+define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
+; ALL32-LABEL: f16xi16_i64:
+; ALL32:       # BB#0:
+; ALL32-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; ALL32-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT:    retl
+;
+; ALL64-LABEL: f16xi16_i64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
+; ALL64-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = add <16 x i16> , %a
+  %res2 = and <16 x i16> , %res1
+  ret <16 x i16> %res2
+}
+
+define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
+; ALL-LABEL: f16xi16_i128:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+  %res1 = add <16 x i16> , %a
+  %res2 = and <16 x i16> , %res1
+  ret <16 x i16> %res2
+}
+
+define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
+; NO-AVX512BW-LABEL: f32xi16_i32:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f32xi16_i32:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <32 x i16> , %a
+  %res2 = and <32 x i16> , %res1
+  ret <32 x i16> %res2
+}
+
+define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
+; AVX2-64-LABEL: f32xi16_i64:
+; AVX2-64:       # BB#0:
+; AVX2-64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX2-64-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    retq
+;
+; AVX512F-64-LABEL: f32xi16_i64:
+; AVX512F-64:       # BB#0:
+; AVX512F-64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX512F-64-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; AVX512F-64-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512F-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-64-NEXT:    retq
+;
+; AVX512BW-64-LABEL: f32xi16_i64:
+; AVX512BW-64:       # BB#0:
+; AVX512BW-64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1
+; AVX512BW-64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    retq
+  %res1 = add <32 x i16> , %a
+  %res2 = and <32 x i16> , %res1
+  ret <32 x i16> %res2
+}
+
+define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
+; NO-AVX512BW-LABEL: f32xi16_i128:
+; NO-AVX512BW:       # BB#0:
+; NO-AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512BW-LABEL: f32xi16_i128:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <32 x i16> , %a
+  %res2 = and <32 x i16> , %res1
+  ret <32 x i16> %res2
+}
+
+define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
+; AVX512BW-LABEL: f32xi16_i256:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+  %res1 = add <32 x i16> , %a
+  %res2 = and <32 x i16> , %res1
+  ret <32 x i16> %res2
+}
+
+define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
+; ALL64-LABEL: f4xi32_i64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    retq
+  %res1 = add <4 x i32> , %a
+  %res2 = and <4 x i32> , %res1
+  ret <4 x i32> %res2
+}
+
+define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
+; ALL64-LABEL: f8xi32_i64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
+; ALL64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = add <8 x i32> , %a
+  %res2 = and <8 x i32> , %res1
+  ret <8 x i32> %res2
+}
+
+define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
+; ALL-LABEL: f8xi32_i128:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+  %res1 = add <8 x i32> , %a
+  %res2 = and <8 x i32> , %res1
+  ret <8 x i32> %res2
+}
+
+define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
+; AVX2-64-LABEL: f16xi32_i64:
+; AVX2-64:       # BB#0:
+; AVX2-64-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX2-64-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    retq
+;
+; AVX512F-64-LABEL: f16xi32_i64:
+; AVX512F-64:       # BB#0:
+; AVX512F-64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1
+; AVX512F-64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    retq
+;
+; AVX512BW-64-LABEL: f16xi32_i64:
+; AVX512BW-64:       # BB#0:
+; AVX512BW-64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1
+; AVX512BW-64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    retq
+  %res1 = add <16 x i32> , %a
+  %res2 = and <16 x i32> , %res1
+  ret <16 x i32> %res2
+}
+
+define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
+; AVX2-LABEL: f16xi32_i128:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+;
+; AVX512-LABEL: f16xi32_i128:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+  %res1 = add <16 x i32> , %a
+  %res2 = and <16 x i32> , %res1
+  ret <16 x i32> %res2
+}
+
+define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
+; ALL64-LABEL: f4xi64_i128:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = add <4 x i64> , %a
+  %res2 = and <4 x i64> , %res1
+  ret <4 x i64> %res2
+}
+
+define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
+; AVX2-64-LABEL: f8xi64_i128:
+; AVX2-64:       # BB#0:
+; AVX2-64-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    retq
+;
+; AVX512F-64-LABEL: f8xi64_i128:
+; AVX512F-64:       # BB#0:
+; AVX512F-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    retq
+;
+; AVX512BW-64-LABEL: f8xi64_i128:
+; AVX512BW-64:       # BB#0:
+; AVX512BW-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    retq
+  %res1 = add <8 x i64> , %a
+  %res2 = and <8 x i64> , %res1
+  ret <8 x i64> %res2
+}
+
+define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
+; AVX512F-64-LABEL: f8xi64_i256:
+; AVX512F-64:       # BB#0:
+; AVX512F-64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    retq
+;
+; AVX512BW-64-LABEL: f8xi64_i256:
+; AVX512BW-64:       # BB#0:
+; AVX512BW-64-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    retq
+  %res1 = add <8 x i64> , %a
+  %res2 = and <8 x i64> , %res1
+  ret <8 x i64> %res2
+}
+
+define <4 x float> @f4xf32_f64(<4 x float> %a) {
+; ALL64-LABEL: f4xf32_f64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; ALL64-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; ALL64-NEXT:    retq
+  %res1 = fadd <4 x float> , %a
+  %res2 = fdiv <4 x float> , %res1
+  ret <4 x float> %res2
+}
+
+define <8 x float> @f8xf32_f64(<8 x float> %a) {
+; ALL64-LABEL: f8xf32_f64:
+; ALL64:       # BB#0:
+; ALL64-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
+; ALL64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; ALL64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; ALL64-NEXT:    retq
+  %res1 = fadd <8 x float> , %a
+  %res2 = fdiv <8 x float> , %res1
+  ret <8 x float> %res2
+}
+
+define <8 x float> @f8xf32_f128(<8 x float> %a) {
+; ALL-LABEL: f8xf32_f128:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+  %res1 = fadd <8 x float> , %a
+  %res2 = fdiv <8 x float> , %res1
+  ret <8 x float> %res2
+}
+
+define <16 x float> @f16xf32_f64(<16 x float> %a) {
+; AVX2-64-LABEL: f16xf32_f64:
+; AVX2-64:       # BB#0:
+; AVX2-64-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
+; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT:    retq
+;
+; AVX512F-64-LABEL: f16xf32_f64:
+; AVX512F-64:       # BB#0:
+; AVX512F-64-NEXT:    vbroadcastsd {{.*}}(%rip), %zmm1
+; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT:    retq
+;
+; AVX512BW-64-LABEL: f16xf32_f64:
+; AVX512BW-64:       # BB#0:
+; AVX512BW-64-NEXT:    vbroadcastsd {{.*}}(%rip), %zmm1
+; AVX512BW-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+; AVX512BW-64-NEXT:    retq
+  %res1 = fadd <16 x float> , %a
+  %res2 = fdiv <16 x float> , %res1
+  ret <16 x float> %res2
+}
+
+define <16 x float> @f16xf32_f128(<16 x float> %a) {
+; AVX2-LABEL: f16xf32_f128:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+;
+; AVX512-LABEL: f16xf32_f128:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+  %res1 = fadd <16 x float> , %a
+  %res2 = fdiv <16 x float> , %res1
+  ret <16 x float> %res2
+}
+
+define <16 x float> @f16xf32_f256(<16 x float> %a) {
+; AVX512-LABEL: f16xf32_f256:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+  %res1 = fadd <16 x float> , %a
+  %res2 = fdiv <16 x float> , %res1
+  ret <16 x float> %res2
+}
+
+define <4 x double> @f4xf64_f128(<4 x double> %a) {
+; ALL-LABEL: f4xf64_f128:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; ALL-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
+  %res1 = fadd <4 x double> , %a
+  %res2 = fdiv <4 x double> , %res1
+  ret <4 x double> %res2
+}
+
+define <8 x double> @f8xf64_f128(<8 x double> %a) {
+; AVX2-LABEL: f8xf64_f128:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+;
+; AVX512-LABEL: f8xf64_f128:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
+  %res1 = fadd <8 x double> , %a
+  %res2 = fdiv <8 x double> , %res1
+  ret <8 x double> %res2
+}
+
+define <8 x double> @f8xf64_f256(<8 x double> %a) {
+; AVX512-LABEL: f8xf64_f256:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
+  %res1 = fadd <8 x double> , %a
+  %res2 = fdiv <8 x double> , %res1
+  ret <8 x double> %res2
+}
+
Index: test/CodeGen/X86/vec_shift6.ll
===================================================================
--- test/CodeGen/X86/vec_shift6.ll
+++ test/CodeGen/X86/vec_shift6.ll
@@ -153,14 +153,14 @@
 ;
 ; AVX2-LABEL: test7:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test7:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
 ; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX512-NEXT:    retq
@@ -183,7 +183,7 @@
 ;
 ; AVX2-LABEL: test8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    retq