Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5833,9 +5833,46 @@
   return SDValue();
 }
 
+static Constant *getConstantVector(MVT VT, APInt SplatValue,
+                                   unsigned SplatBitSize, LLVMContext &C) {
+  unsigned ScalarSize = VT.getScalarSizeInBits();
+  unsigned NumElm = SplatBitSize / ScalarSize;
+
+  SmallVector<Constant *, 32> ConstantVec;
+  for (unsigned i = 0; i < NumElm; i++) {
+    APInt Val = APInt(SplatValue);
+    Val = Val.lshr(ScalarSize * i).trunc(ScalarSize);
+    Constant *Const;
+    if (VT.isFloatingPoint()) {
+      assert((ScalarSize == 32 || ScalarSize == 64) &&
+             "Unsupported floating point scalar size");
+      if (ScalarSize == 32)
+        Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
+      else
+        Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
+    }
+    else
+      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
+    ConstantVec.push_back(Const);
+  }
+  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
+}
+
+static bool isUseOfShuffleThroughBitcast(SDNode *N) {
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    if (isTargetShuffle((*I)->getOpcode()))
+      return true;
+    if ((*I)->getOpcode() == ISD::BITCAST)
+      return isUseOfShuffleThroughBitcast(*I);
+  }
+  return false;
+}
+
 /// Attempt to use the vbroadcast instruction to generate a splat value for the
 /// following cases:
-/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
+/// 1. A splat BUILD_VECTOR which uses:
+///    a. A single scalar load, or a constant.
+///    b. A repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
 /// a scalar load, or a constant.
 /// The VBROADCAST node is returned when a pattern is found,
@@ -5869,8 +5906,58 @@
 
   // We need a splat of a single value to use broadcast, and it doesn't
   // make any sense if the value is only in one element of the vector.
-  if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
+  if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
+    APInt SplatValue, Undef;
+    unsigned SplatBitSize;
+    bool HasUndef;
+    // Check if this is a repeated constant pattern suitable for broadcasting.
+    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
+        SplatBitSize > VT.getScalarSizeInBits() &&
+        SplatBitSize < VT.getSizeInBits()) {
+      // Avoid replacing the BUILD_VECTOR with a broadcast when it is used by a
+      // shuffle; this preserves the existing custom lowering of shuffles.
+      if (isUseOfShuffleThroughBitcast(BVOp))
+        return SDValue();
+      // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
+      if (Subtarget.hasAVX2()) {
+        if (SplatBitSize <= 32 ||
+            (Subtarget.is64Bit() && SplatBitSize <= 64)) {
+          // The splatted value can fit in one constant in the constant pool.
+          // Load the constant and broadcast it.
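+          // For example, a v8i16 BUILD_VECTOR of the repeated pattern
+          // <0,1,0,1,0,1,0,1> has SplatBitSize == 32: the two i16 elements
+          // become a single i32 constant-pool entry that is broadcast to
+          // every 32-bit lane of the result (see the vpbroadcastd and
+          // vpbroadcastq checks added to the tests below).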
+ MVT CVT = MVT::getIntegerVT(SplatBitSize); + Type* ScalarTy = Type::getIntNTy(*DAG.getContext(), SplatBitSize); + Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue); + SDValue CP = DAG.getConstantPool(C, PVT); + unsigned Repeat = VT.getSizeInBits()/SplatBitSize; + + unsigned Alignment = cast(CP)->getAlignment(); + Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + Alignment); + SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, + MVT::getVectorVT(CVT, Repeat), Ld); + return DAG.getBitcast(VT, Brdcst); + } else if (SplatBitSize != 64) { // No support for 64bit vectors. + // Load the vector of constants and broadcast it. + MVT CVT = VT.getScalarType(); + Constant* VecC = getConstantVector(VT, SplatValue, SplatBitSize, + *DAG.getContext()); + SDValue VCP = DAG.getConstantPool(VecC, PVT); + unsigned NumElm = SplatBitSize/VT.getScalarSizeInBits(); + unsigned Alignment = cast(VCP)->getAlignment(); + Ld = DAG.getLoad(MVT::getVectorVT(CVT, NumElm), dl, + DAG.getEntryNode(), VCP, + MachinePointerInfo::getConstantPool( + DAG.getMachineFunction()), + Alignment); + SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld); + return DAG.getBitcast(VT, Brdcst); + } + } + } return SDValue(); + } Ld = Splat; ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Index: test/CodeGen/X86/avg.ll =================================================================== --- test/CodeGen/X86/avg.ll +++ test/CodeGen/X86/avg.ll @@ -1751,21 +1751,24 @@ ; AVX2-LABEL: avg_v4i8_const: ; AVX2: # BB#0: ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, (%rax) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v4i8_const: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rax) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v4i8_const: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rax) ; AVX512BW-NEXT: retq %1 = load <4 x i8>, <4 x i8>* %a @@ -1788,21 +1791,24 @@ ; AVX2-LABEL: avg_v8i8_const: ; AVX2: # BB#0: ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v8i8_const: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rax) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v8i8_const: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; AVX512BW-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: retq %1 = load <8 x i8>, <8 x i8>* %a @@ -1824,22 +1830,22 @@ ; ; AVX2-LABEL: avg_v16i8_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), 
%xmm0 -; AVX2-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 +; AVX2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v16i8_const: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 +; AVX512F-NEXT: vpavgb (%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqu %xmm0, (%rax) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v16i8_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 +; AVX512BW-NEXT: vpavgb (%rdi), %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) ; AVX512BW-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a @@ -1914,23 +1920,23 @@ ; ; AVX2-LABEL: avg_v32i8_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 +; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i8_const: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 +; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i8_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 +; AVX512BW-NEXT: vpavgb (%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) ; AVX512BW-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %a @@ -2132,7 +2138,7 @@ ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 @@ -2153,8 +2159,8 @@ ; ; AVX512BW-LABEL: avg_v64i8_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 +; AVX512BW-NEXT: vpavgb (%rdi), 
%zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) ; AVX512BW-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a @@ -2177,21 +2183,24 @@ ; AVX2-LABEL: avg_v4i16_const: ; AVX2: # BB#0: ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v4i16_const: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rax) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v4i16_const: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; AVX512BW-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rax) ; AVX512BW-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a @@ -2278,23 +2287,23 @@ ; ; AVX2-LABEL: avg_v16i16_const: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v16i16_const: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v16i16_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX512BW-NEXT: vpavgw (%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) ; AVX512BW-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a @@ -2405,7 +2414,7 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 @@ -2416,8 +2425,8 @@ ; ; AVX512BW-LABEL: avg_v32i16_const: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) ; AVX512BW-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %a Index: test/CodeGen/X86/avx-logic.ll =================================================================== --- test/CodeGen/X86/avx-logic.ll +++ test/CodeGen/X86/avx-logic.ll @@ -1,14 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: 
llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s -check-prefix=AVX512F define <4 x double> @andpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp { -; CHECK-LABEL: andpd256: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandpd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: andpd256: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: andpd256: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vandpd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <4 x double> %x to <4 x i64> %1 = bitcast <4 x double> %y to <4 x i64> @@ -20,12 +27,19 @@ } define <4 x double> @andpd256fold(<4 x double> %y) nounwind uwtable readnone ssp { -; CHECK-LABEL: andpd256fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: andpd256fold: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: andpd256fold: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <4 x double> %y to <4 x i64> %and.i = and <4 x i64> %0, @@ -36,10 +50,15 @@ } define <8 x float> @andps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp { -; CHECK-LABEL: andps256: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: andps256: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: andps256: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vandps %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <8 x float> %x to <8 x i32> %1 = bitcast <8 x float> %y to <8 x i32> @@ -49,10 +68,16 @@ } define <8 x float> @andps256fold(<8 x float> %y) nounwind uwtable readnone ssp { -; CHECK-LABEL: andps256fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: andps256fold: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: andps256fold: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <8 x float> %y to <8 x i32> %and.i = and <8 x i32> %0, @@ -61,12 +86,19 @@ } define <4 x double> @xorpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp { -; CHECK-LABEL: xorpd256: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vxorpd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: xorpd256: +; AVX: # BB#0: # %entry +; AVX-NEXT: vxorpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vxorpd %ymm1, %ymm1, 
%ymm1 +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: xorpd256: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vxorpd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <4 x double> %x to <4 x i64> %1 = bitcast <4 x double> %y to <4 x i64> @@ -78,12 +110,19 @@ } define <4 x double> @xorpd256fold(<4 x double> %y) nounwind uwtable readnone ssp { -; CHECK-LABEL: xorpd256fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vxorpd {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: xorpd256fold: +; AVX: # BB#0: # %entry +; AVX-NEXT: vxorpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: xorpd256fold: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vxorpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <4 x double> %y to <4 x i64> %xor.i = xor <4 x i64> %0, @@ -94,10 +133,15 @@ } define <8 x float> @xorps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp { -; CHECK-LABEL: xorps256: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vxorps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: xorps256: +; AVX: # BB#0: # %entry +; AVX-NEXT: vxorps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: xorps256: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vxorps %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <8 x float> %x to <8 x i32> %1 = bitcast <8 x float> %y to <8 x i32> @@ -107,10 +151,16 @@ } define <8 x float> @xorps256fold(<8 x float> %y) nounwind uwtable readnone ssp { -; CHECK-LABEL: xorps256fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: xorps256fold: +; AVX: # BB#0: # %entry +; AVX-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: xorps256fold: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <8 x float> %y to <8 x i32> %xor.i = xor <8 x i32> %0, @@ -119,12 +169,19 @@ } define <4 x double> @orpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp { -; CHECK-LABEL: orpd256: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vorpd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: orpd256: +; AVX: # BB#0: # %entry +; AVX-NEXT: vorpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: orpd256: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vorpd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <4 x double> %x to <4 x i64> %1 = bitcast <4 x double> %y to <4 x i64> @@ -136,12 +193,19 @@ } define <4 x double> @orpd256fold(<4 x double> %y) nounwind uwtable readnone ssp { -; CHECK-LABEL: orpd256fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: orpd256fold: +; AVX: # BB#0: # %entry +; AVX-NEXT: vorpd {{.*}}(%rip), 
%ymm0, %ymm0 +; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: orpd256fold: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <4 x double> %y to <4 x i64> %or.i = or <4 x i64> %0, @@ -152,10 +216,15 @@ } define <8 x float> @orps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp { -; CHECK-LABEL: orps256: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vorps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: orps256: +; AVX: # BB#0: # %entry +; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: orps256: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <8 x float> %x to <8 x i32> %1 = bitcast <8 x float> %y to <8 x i32> @@ -165,10 +234,16 @@ } define <8 x float> @orps256fold(<8 x float> %y) nounwind uwtable readnone ssp { -; CHECK-LABEL: orps256fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: orps256fold: +; AVX: # BB#0: # %entry +; AVX-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: orps256fold: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <8 x float> %y to <8 x i32> %or.i = or <8 x i32> %0, @@ -177,12 +252,19 @@ } define <4 x double> @andnotpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp { -; CHECK-LABEL: andnotpd256: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandnpd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: andnotpd256: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandnpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: andnotpd256: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vandnpd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <4 x double> %x to <4 x i64> %neg.i = xor <4 x i64> %0, @@ -195,12 +277,19 @@ } define <4 x double> @andnotpd256fold(<4 x double> %y, <4 x double>* nocapture %x) nounwind uwtable readonly ssp { -; CHECK-LABEL: andnotpd256fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandnpd (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: andnotpd256fold: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandnpd (%rdi), %ymm0, %ymm0 +; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: andnotpd256fold: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vandnpd (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %tmp2 = load <4 x double>, <4 x double>* %x, align 32 %0 = bitcast <4 x double> %y to <4 x i64> @@ -214,10 +303,15 @@ } define <8 x float> @andnotps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp { -; CHECK-LABEL: andnotps256: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: andnotps256: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandnps 
%ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: andnotps256: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq entry: %0 = bitcast <8 x float> %x to <8 x i32> %neg.i = xor <8 x i32> %0, @@ -228,10 +322,15 @@ } define <8 x float> @andnotps256fold(<8 x float> %y, <8 x float>* nocapture %x) nounwind uwtable readonly ssp { -; CHECK-LABEL: andnotps256fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX-LABEL: andnotps256fold: +; AVX: # BB#0: # %entry +; AVX-NEXT: vandnps (%rdi), %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: andnotps256fold: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vandnps (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: retq entry: %tmp2 = load <8 x float>, <8 x float>* %x, align 32 %0 = bitcast <8 x float> %y to <8 x i32> @@ -245,11 +344,17 @@ ;;; Test that basic 2 x i64 logic use the integer version on AVX define <2 x i64> @vpandn(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: vpandn: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-NEXT: vpandn %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: retq +; AVX-LABEL: vpandn: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: vpandn: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1 +; AVX512F-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: retq entry: ; Force the execution domain with an add. %a2 = add <2 x i64> %a, @@ -259,11 +364,17 @@ } define <2 x i64> @vpand(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: vpand: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: retq +; AVX-LABEL: vpand: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX512F-LABEL: vpand: +; AVX512F: # BB#0: # %entry +; AVX512F-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq entry: ; Force the execution domain with an add. 
%a2 = add <2 x i64> %a, Index: test/CodeGen/X86/avx-vperm2x128.ll =================================================================== --- test/CodeGen/X86/avx-vperm2x128.ll +++ test/CodeGen/X86/avx-vperm2x128.ll @@ -620,7 +620,8 @@ ; AVX2-LABEL: ld0_hi0_lo1_8i32: ; AVX2: ## BB#0: ## %entry ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1] -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq entry: %a = load <8 x i32>, <8 x i32> * %pa @@ -643,7 +644,8 @@ ; AVX2-LABEL: ld1_hi0_hi1_8i32: ; AVX2: ## BB#0: ## %entry ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq entry: %b = load <8 x i32>, <8 x i32> * %pb Index: test/CodeGen/X86/avx2-conversions.ll =================================================================== --- test/CodeGen/X86/avx2-conversions.ll +++ test/CodeGen/X86/avx2-conversions.ll @@ -101,13 +101,15 @@ define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind { ; X32-LABEL: zext_8i8_8i32: ; X32: ## BB#0: -; X32-NEXT: vpand LCPI6_0, %xmm0, %xmm0 +; X32-NEXT: vpbroadcastw LCPI6_0, %xmm1 +; X32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X32-NEXT: retl ; ; X64-LABEL: zext_8i8_8i32: ; X64: ## BB#0: -; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1 +; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: retq %B = zext <8 x i8> %A to <8 x i32> Index: test/CodeGen/X86/avx2-nontemporal.ll =================================================================== --- test/CodeGen/X86/avx2-nontemporal.ll +++ test/CodeGen/X86/avx2-nontemporal.ll @@ -10,20 +10,20 @@ ; X32-NEXT: andl $-32, %esp ; X32-NEXT: subl $32, %esp ; X32-NEXT: vmovdqa 104(%ebp), %ymm3 -; X32-NEXT: vmovdqa 72(%ebp), %ymm4 -; X32-NEXT: vmovdqa 40(%ebp), %ymm5 +; X32-NEXT: vmovdqa 40(%ebp), %ymm4 ; X32-NEXT: movl 8(%ebp), %eax -; X32-NEXT: vaddps .LCPI0_0, %ymm0, %ymm0 +; X32-NEXT: vaddps {{\.LCPI.*}}, %ymm0, %ymm0 ; X32-NEXT: vmovntps %ymm0, (%eax) -; X32-NEXT: vpaddq .LCPI0_1, %ymm2, %ymm0 +; X32-NEXT: vpaddq {{\.LCPI.*}}, %ymm2, %ymm0 ; X32-NEXT: vmovntdq %ymm0, (%eax) -; X32-NEXT: vaddpd .LCPI0_2, %ymm1, %ymm0 +; X32-NEXT: vaddpd {{\.LCPI.*}}, %ymm1, %ymm0 ; X32-NEXT: vmovntpd %ymm0, (%eax) -; X32-NEXT: vpaddd .LCPI0_3, %ymm5, %ymm0 +; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm4, %ymm0 ; X32-NEXT: vmovntdq %ymm0, (%eax) -; X32-NEXT: vpaddw .LCPI0_4, %ymm4, %ymm0 +; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X32-NEXT: vpaddw 72(%ebp), %ymm0, %ymm0 ; X32-NEXT: vmovntdq %ymm0, (%eax) -; X32-NEXT: vpaddb .LCPI0_5, %ymm3, %ymm0 +; X32-NEXT: vpaddb {{\.LCPI.*}}, %ymm3, %ymm0 ; X32-NEXT: vmovntdq %ymm0, (%eax) ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp @@ -40,9 +40,11 @@ ; X64-NEXT: vmovntpd %ymm0, (%rdi) ; X64-NEXT: vpaddd {{.*}}(%rip), %ymm3, %ymm0 ; X64-NEXT: vmovntdq %ymm0, (%rdi) -; X64-NEXT: vpaddw {{.*}}(%rip), %ymm4, %ymm0 +; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-NEXT: vpaddw %ymm0, %ymm4, %ymm0 ; X64-NEXT: vmovntdq %ymm0, (%rdi) -; X64-NEXT: vpaddb {{.*}}(%rip), %ymm5, %ymm0 +; X64-NEXT: 
vpbroadcastq {{.*}}(%rip), %ymm0 +; X64-NEXT: vpaddb %ymm0, %ymm5, %ymm0 ; X64-NEXT: vmovntdq %ymm0, (%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq Index: test/CodeGen/X86/avx512-arith.ll =================================================================== --- test/CodeGen/X86/avx512-arith.ll +++ test/CodeGen/X86/avx512-arith.ll @@ -265,7 +265,8 @@ define <8 x double> @mulpd512fold(<8 x double> %y) { ; CHECK-LABEL: mulpd512fold: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmulpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq entry: %mul.i = fmul <8 x double> %y, @@ -285,7 +286,8 @@ define <16 x float> @mulps512fold(<16 x float> %y) { ; CHECK-LABEL: mulps512fold: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq entry: %mul.i = fmul <16 x float> %y, @@ -305,7 +307,8 @@ define <8 x double> @divpd512fold(<8 x double> %y) { ; CHECK-LABEL: divpd512fold: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vdivpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq entry: %div.i = fdiv <8 x double> %y, @@ -323,10 +326,35 @@ } define <16 x float> @divps512fold(<16 x float> %y) { -; CHECK-LABEL: divps512fold: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0 -; CHECK-NEXT: retq +; AVX512F-LABEL: divps512fold: +; AVX512F: ## BB#0: ## %entry +; AVX512F-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vdivps %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: divps512fold: +; AVX512VL: ## BB#0: ## %entry +; AVX512VL-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512VL-NEXT: vdivps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: divps512fold: +; AVX512BW: ## BB#0: ## %entry +; AVX512BW-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vdivps %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: divps512fold: +; AVX512DQ: ## BB#0: ## %entry +; AVX512DQ-NEXT: vbroadcastf32x8 {{.*#+}} zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vdivps %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: divps512fold: +; SKX: ## BB#0: ## %entry +; SKX-NEXT: vbroadcastf32x8 {{.*#+}} zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; SKX-NEXT: vdivps %zmm1, %zmm0, %zmm0 +; SKX-NEXT: retq entry: %div.i = fdiv <16 x float> %y, ret <16 x float> %div.i Index: test/CodeGen/X86/avx512-ext.ll =================================================================== --- test/CodeGen/X86/avx512-ext.ll +++ test/CodeGen/X86/avx512-ext.ll @@ -1987,7 +1987,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 { ; KNL-LABEL: zext_4xi1_to_4x32: ; KNL: ## BB#0: -; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 ; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -1996,9 +1996,9 @@ ; ; SKX-LABEL: zext_4xi1_to_4x32: ; SKX: ## BB#0: -; SKX-NEXT: vmovdqa64 {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SKX-NEXT: vpandq %xmm2, %xmm1, %xmm1 -; SKX-NEXT: vpandq %xmm2, %xmm0, %xmm0 +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; SKX-NEXT: vpandd %xmm2, %xmm1, 
%xmm1 +; SKX-NEXT: vpandd %xmm2, %xmm0, %xmm0 ; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 ; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} ; SKX-NEXT: retq @@ -2010,7 +2010,7 @@ define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 { ; KNL-LABEL: zext_2xi1_to_2xi64: ; KNL: ## BB#0: -; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %xmm2 ; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 @@ -2019,7 +2019,7 @@ ; ; SKX-LABEL: zext_2xi1_to_2xi64: ; SKX: ## BB#0: -; SKX-NEXT: vmovdqa64 {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %xmm2 ; SKX-NEXT: vpandq %xmm2, %xmm1, %xmm1 ; SKX-NEXT: vpandq %xmm2, %xmm0, %xmm0 ; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 Index: test/CodeGen/X86/avx512-gather-scatter-intrin.ll =================================================================== --- test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -17,7 +17,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: kmovq %k1, %k2 ; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2} -; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1} ; CHECK-NEXT: retq %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4) @@ -32,7 +33,8 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: kmovq %k1, %k2 ; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2} -; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm2 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1} ; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) @@ -47,7 +49,8 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: kmovq %k1, %k2 ; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2} -; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1} ; CHECK-NEXT: retq %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) @@ -62,7 +65,8 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: kmovq %k1, %k2 ; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2} -; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1} ; CHECK-NEXT: retq %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) @@ -89,7 +93,8 @@ ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: kmovq %k1, %k2 ; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2} -; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1} ; CHECK-NEXT: retq %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4) @@ -104,7 +109,8 @@ ; CHECK-NEXT: kmovb %edi, 
%k1 ; CHECK-NEXT: kmovq %k1, %k2 ; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2} -; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1} ; CHECK-NEXT: retq %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) @@ -119,7 +125,8 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: kmovq %k1, %k2 ; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2} -; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1} ; CHECK-NEXT: retq %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4) @@ -134,7 +141,8 @@ ; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: kmovq %k1, %k2 ; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2} -; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm2 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1} ; CHECK-NEXT: retq %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4) @@ -243,7 +251,8 @@ ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: kxnorw %k0, %k0, %k2 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} -; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} ; CHECK-NEXT: retq %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4) @@ -300,7 +309,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %esi, %k1 ; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1} -; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8) %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8) @@ -336,7 +345,7 @@ ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1} ; CHECK-NEXT: kxnorw %k0, %k0, %k1 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1} -; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8) %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8) @@ -474,7 +483,7 @@ ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %esi, %k1 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1} -; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: retq %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8) Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -960,8 +960,9 @@ define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; KNL-LABEL: test_build_vec_v64i1: ; KNL: ## BB#0: +; 
KNL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; KNL-NEXT: vandps %ymm2, %ymm1, %ymm1 ; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 ; KNL-NEXT: retq ; ; SKX-LABEL: test_build_vec_v64i1: Index: test/CodeGen/X86/avx512bw-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bw-intrinsics.ll +++ test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2346,14 +2346,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] -; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpsravw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] -; AVX512F-32-NEXT: vpsravw {{\.LCPI.*}}, %zmm0, %zmm0 +; AVX512F-32-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-32-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-32-NEXT: vpsravw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> , <32 x i16> , Index: test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll =================================================================== --- test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -0,0 +1,460 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2-32 --check-prefix=ALL --check-prefix=NO-AVX512BW +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F-32 --check-prefix=ALL --check-prefix=NO-AVX512BW +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW-32 --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2-64 --check-prefix=ALL --check-prefix=ALL64 --check-prefix=NO-AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F-64 --check-prefix=ALL --check-prefix=ALL64 --check-prefix=NO-AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512BW-64 --check-prefix=ALL --check-prefix=ALL64 --check-prefix=AVX512BW + +;===-----------------------------------------------------------------------------=== +; This test checks the ability to recognize a cross element pattern of +; constants and perform the load via broadcasting a smaller constant +; vector. 
+; For example: +; => broadcast of the constant vector +;===-----------------------------------------------------------------------------=== + +define <16 x i8> @f16xi8_i16() { +; ALL-LABEL: f16xi8_i16: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastw {{\.LCPI.*}}, %xmm0 + ret <16 x i8> +} + +define <16 x i8> @f16xi8_i32() { +; ALL-LABEL: f16xi8_i32: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm0 + ret <16 x i8> +} + +define <16 x i8> @f16xi8_i64() { +; ALL64-LABEL: f16xi8_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 + ret <16 x i8> +} + +define <32 x i8> @f32xi8_i16() { +; ALL-LABEL: f32xi8_i16: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm0 + ret <32 x i8> +} + +define <32 x i8> @f32xi8_i32() { +; ALL-LABEL: f32xi8_i32: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastss {{\.LCPI.*}}, %ymm0 + ret <32 x i8> +} + +define <32 x i8> @f32xi8_i64() { +; ALL64-LABEL: f32xi8_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 + ret <32 x i8> +} + +define <32 x i8> @f32xi8_i128() { +; ALL-LABEL: f32xi8_i128: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] + ret <32 x i8> +} + +define <64 x i8> @f64xi8_i16() { +; NO-AVX512BW-LABEL: f64xi8_i16: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm0 +; NO-AVX512BW-NEXT: vmovdqa %ymm0, %ymm1 +; +; AVX512BW-LABEL: f64xi8_i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %zmm0 + ret <64 x i8> +} + +define <64 x i8> @f64i8_i32() { +; AVX512BW-LABEL: f64i8_i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vbroadcastss {{\.LCPI.*}}, %zmm0 +; +; NO-AVX512BW-LABEL: f64i8_i32: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vbroadcastss {{\.LCPI.*}}, %ymm0 +; NO-AVX512BW-NEXT: vmovaps %ymm0, %ymm1 + ret <64 x i8> +} + +define <64 x i8> @f64xi8_i64() { +; AVX2-64-LABEL: f64xi8_i64: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 +; AVX2-64-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f64xi8_i64: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 +; AVX512F-64-NEXT: vmovaps %ymm0, %ymm1 +; AVX512F-64-NEXT: retq +; +; AVX512BW-64-LABEL: f64xi8_i64: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vbroadcastsd {{.*}}(%rip), %zmm0 +; AVX512BW-64-NEXT: retq + ret <64 x i8> +} + +define <64 x i8> @f64xi8_i128() { +; NO-AVX512BW-LABEL: f64xi8_i128: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; NO-AVX512BW-NEXT: vmovaps %ymm0, %ymm1 +; +; AVX512BW-LABEL: f64xi8_i128: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] + ret <64 x i8> +} + +define <64 x i8> @f64xi8_i256() { +; AVX512BW-LABEL: f64xi8_i256: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] + ret <64 x i8> +} + +define <8 x i16> @f8xi16_i32() { +; ALL-LABEL: f8xi16_i32: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm0 +; ALL-NEXT: ret + ret <8 x i16> +} + +define <8 x i16> @f8xi16_i64() { +; ALL64-LABEL: f8xi16_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 +; ALL64-NEXT: retq + ret <8 x i16> +} + +define <16 x i16> @f16xi16_i32() { +; ALL-LABEL: f16xi16_i32: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastss {{\.LCPI.*}}, %ymm0 +; ALL-NEXT: ret + ret <16 x i16> +} + +define <16 x i16> @f16xi16_i64() { +; ALL64-LABEL: f16xi16_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 +; ALL64-NEXT: retq + ret <16 x i16> +} + +define <16 x 
i16> @f16xi16_i128() { +; ALL-LABEL: f16xi16_i128: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; ALL-NEXT: ret + ret <16 x i16> +} + +define <32 x i16> @f32xi16_i32() { +; NO-AVX512BW-LABEL: f32xi16_i32: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vbroadcastss {{\.LCPI.*}}, %ymm0 +; NO-AVX512BW-NEXT: vmovaps %ymm0, %ymm1 +; NO-AVX512BW-NEXT: ret +; +; AVX512BW-LABEL: f32xi16_i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vbroadcastss {{\.LCPI.*}}, %zmm0 +; AVX512BW-NEXT: ret + ret <32 x i16> +} + +define <32 x i16> @f32xi16_i64() { +; AVX2-64-LABEL: f32xi16_i64: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 +; AVX2-64-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f32xi16_i64: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 +; AVX512F-64-NEXT: vmovaps %ymm0, %ymm1 +; AVX512F-64-NEXT: retq +; +; AVX512BW-64-LABEL: f32xi16_i64: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vbroadcastsd {{.*}}(%rip), %zmm0 +; AVX512BW-64-NEXT: retq + ret <32 x i16> +} + +define <32 x i16> @f32xi16_i128() { +; NO-AVX512BW-LABEL: f32xi16_i128: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; NO-AVX512BW-NEXT: vmovaps %ymm0, %ymm1 +; NO-AVX512BW-NEXT: ret +; +; AVX512BW-LABEL: f32xi16_i128: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: ret + ret <32 x i16> +} + +define <32 x i16> @f32xi16_i256() { +; AVX512BW-LABEL: f32xi16_i256: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: ret + ret <32 x i16> +} + +define <4 x i32> @f4xi32_i64() { +; ALL64-LABEL: f4xi32_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 +; ALL64-NEXT: retq + ret <4 x i32> +} + +define <8 x i32> @f8xi32_i64() { +; ALL64-LABEL: f8xi32_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 +; ALL64-NEXT: retq + ret <8 x i32> +} + +define <8 x i32> @f8xi32_i128() { +; ALL-LABEL: f8xi32_i128: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; ALL-NEXT: ret + ret <8 x i32> +} + +define <16 x i32> @f16xi32_i64() { +; AVX2-64-LABEL: f16xi32_i64: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 +; AVX2-64-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xi32_i64: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastsd {{.*}}(%rip), %zmm0 +; AVX512F-64-NEXT: retq +; +; AVX512BW-64-LABEL: f16xi32_i64: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vbroadcastsd {{.*}}(%rip), %zmm0 +; AVX512BW-64-NEXT: retq + ret <16 x i32> +} + +define <16 x i32> @f16xi32_i128() { +; AVX2-32-LABEL: f16xi32_i128: +; AVX2-32: # BB#0: +; AVX2-32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-32-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-32-NEXT: retl +; +; AVX512F-32-LABEL: f16xi32_i128: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-32-NEXT: retl +; +; AVX2-64-LABEL: f16xi32_i128: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-64-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xi32_i128: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: retq + ret <16 x i32> +} + +define <4 x i64> @f4xi64_i128() { +; ALL-LABEL: f4xi64_i128: +; ALL: # BB#0: +; ALL-NEXT: 
vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; ALL-NEXT: ret + ret <4 x i64> +} + +define <8 x i64> @f8xi64_i128() { +; AVX2-32-LABEL: f8xi64_i128: +; AVX2-32: # BB#0: +; AVX2-32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-32-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-32-NEXT: retl +; +; AVX512F-32-LABEL: f8xi64_i128: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-32-NEXT: retl +; +; AVX2-64-LABEL: f8xi64_i128: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-64-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f8xi64_i128: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: retq + ret <8 x i64> +} + +define <8 x i64> @f8xi64_i256() { +; AVX512F-32-LABEL: f8xi64_i256: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-32-NEXT: retl +; +; AVX512F-64-LABEL: f8xi64_i256: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: retq + ret <8 x i64> +} + +define <4 x float> @f4xf32_f64() { +; ALL64-LABEL: f4xf32_f64: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 +; ALL64-NEXT: retq + ret <4 x float> +} + +define <8 x float> @f8xf32_f64() { +; ALL64-LABEL: f8xf32_f64: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 +; ALL64-NEXT: retq + ret <8 x float> +} + +define <8 x float> @f8xf32_f128() { +; ALL-LABEL: f8xf32_f128: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; ALL-NEXT: ret + ret <8 x float> +} + +define <16 x float> @f16xf32_f64() { +; AVX2-64-LABEL: f16xf32_f64: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 +; AVX2-64-NEXT: vmovaps %ymm0, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xf32_f64: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastsd {{.*}}(%rip), %zmm0 +; AVX512F-64-NEXT: retq + ret <16 x float> +} + +define <16 x float> @f16xf32_f128() { +; AVX2-32-LABEL: f16xf32_f128: +; AVX2-32: # BB#0: +; AVX2-32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-32-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-32-NEXT: retl +; +; AVX512F-32-LABEL: f16xf32_f128: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-32-NEXT: retl +; +; AVX2-64-LABEL: f16xf32_f128: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-64-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xf32_f128: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: retq + ret <16 x float> +} + +define <16 x float> @f16xf32_f256() { +; AVX512F-32-LABEL: f16xf32_f256: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-32-NEXT: retl +; +; AVX512F-64-LABEL: f16xf32_f256: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: retq + ret <16 x float> +} + +define <4 x double> @f4xf64_f128() { +; ALL-LABEL: f4xf64_f128: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; ALL-NEXT: ret + ret <4 x double> +} + +define <8 x double> @f8xf64_f128() { +; AVX2-32-LABEL: f8xf64_f128: +; AVX2-32: # BB#0: +; AVX2-32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 
+; AVX2-32-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-32-NEXT: retl +; +; AVX512F-32-LABEL: f8xf64_f128: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-32-NEXT: retl +; +; AVX2-64-LABEL: f8xf64_f128: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-64-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f8xf64_f128: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: retq + ret <8 x double> +} + +define <8 x double> @f8xf64_f256() { +; AVX512F-32-LABEL: f8xf64_f256: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-32-NEXT: retl +; +; AVX512F-64-LABEL: f8xf64_f256: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: retq + ret <8 x double> +} + Index: test/CodeGen/X86/clear_upper_vector_element_bits.ll =================================================================== --- test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -650,10 +650,16 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: _clearupper8xi16c: -; AVX: # BB#0: -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: _clearupper8xi16c: +; AVX1: # BB#0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: _clearupper8xi16c: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %r = and <8 x i16> , %0 ret <8 x i16> %r } Index: test/CodeGen/X86/combine-sdiv.ll =================================================================== --- test/CodeGen/X86/combine-sdiv.ll +++ test/CodeGen/X86/combine-sdiv.ll @@ -69,7 +69,8 @@ ; ; AVX-LABEL: combine_vec_sdiv_by_pos0: ; AVX: # BB#0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $2, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %x, @@ -94,7 +95,8 @@ ; ; AVX-LABEL: combine_vec_sdiv_by_pos1: ; AVX: # BB#0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrd $1, %xmm0, %eax ; AVX-NEXT: shrl $2, %eax ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1 Index: test/CodeGen/X86/combine-srem.ll =================================================================== --- test/CodeGen/X86/combine-srem.ll +++ test/CodeGen/X86/combine-srem.ll @@ -66,7 +66,8 @@ ; ; AVX-LABEL: combine_vec_srem_by_pos1: ; AVX: # BB#0: -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrd $3, %xmm0, %eax ; AVX-NEXT: andl $15, %eax ; AVX-NEXT: vmovd %eax, %xmm1 Index: test/CodeGen/X86/promote-vec3.ll =================================================================== --- test/CodeGen/X86/promote-vec3.ll +++ test/CodeGen/X86/promote-vec3.ll @@ -58,7 +58,8 @@ ; AVX-64-NEXT: vmovd %edi, %xmm0 ; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 -; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vmovd %xmm0, %eax ; AVX-64-NEXT: vpextrw $2, %xmm0, %edx ; AVX-64-NEXT: vpextrw $4, 
%xmm0, %ecx Index: test/CodeGen/X86/vec_fp_to_int.ll =================================================================== --- test/CodeGen/X86/vec_fp_to_int.ll +++ test/CodeGen/X86/vec_fp_to_int.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 @@ -1910,14 +1911,19 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = <4294967295,1,u,u> ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_2f64_to_2i32_const: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> -; AVX-NEXT: retq +; AVX1-LABEL: fptosi_2f64_to_2i32_const: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptosi_2f64_to_2i32_const: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: fptosi_2f64_to_2i32_const: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> +; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 ; AVX512-NEXT: retq %cvt = fptosi <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> @@ -1988,14 +1994,19 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = <2,4,u,u> ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_2f64_to_2i32_const: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> -; AVX-NEXT: retq +; AVX1-LABEL: fptoui_2f64_to_2i32_const: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> +; AVX1-NEXT: retq +; +; AVX2-LABEL: fptoui_2f64_to_2i32_const: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: fptoui_2f64_to_2i32_const: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> +; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %xmm0 ; AVX512-NEXT: retq %cvt = fptoui <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> Index: test/CodeGen/X86/vec_shift6.ll =================================================================== --- test/CodeGen/X86/vec_shift6.ll +++ test/CodeGen/X86/vec_shift6.ll @@ -101,12 +101,14 @@ ; ; AVX2-LABEL: test5: ; AVX2: # BB#0: -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test5: ; AVX512: # BB#0: -; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shl = shl <16 x i16> %a, ret <16 x i16> %shl @@ -126,12 +128,14 @@ ; ; AVX2-LABEL: test6: ; AVX2: # BB#0: -; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test6: ; AVX512: # BB#0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %shl = shl <8 x i32> %a, ret <8 x i32> %shl @@ -153,14 +157,14 @@ ; ; AVX2-LABEL: test7: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048] +; AVX2-NEXT: 
vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test7: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512-NEXT: retq @@ -183,14 +187,15 @@ ; ; AVX2-LABEL: test8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test8: ; AVX512: # BB#0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %shl = shl <16 x i32> %a, ret <16 x i32> %shl @@ -222,7 +227,8 @@ ; ; AVX512-LABEL: test9: ; AVX512: # BB#0: -; AVX512-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %shl = shl <8 x i64> %a, ret <8 x i64> %shl Index: test/CodeGen/X86/vector-blend.ll =================================================================== --- test/CodeGen/X86/vector-blend.ll +++ test/CodeGen/X86/vector-blend.ll @@ -278,11 +278,17 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_i8: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: vsel_i8: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i8: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq entry: %vsel = select <16 x i1> , <16 x i8> %v1, <16 x i8> %v2 ret <16 x i8> %vsel @@ -669,7 +675,7 @@ ; ; AVX2-LABEL: constant_pblendvb_avx2: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq entry: Index: test/CodeGen/X86/vector-rotate-256.ll =================================================================== --- test/CodeGen/X86/vector-rotate-256.ll +++ test/CodeGen/X86/vector-rotate-256.ll @@ -633,7 +633,7 @@ ; ; AVX2-LABEL: constant_rotate_v32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -645,7 +645,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 Index: 
test/CodeGen/X86/vector-shift-ashr-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-256.ll +++ test/CodeGen/X86/vector-shift-ashr-256.ll @@ -876,7 +876,7 @@ ; ; AVX2-LABEL: constant_shift_v32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] @@ -925,7 +925,7 @@ ; ; AVX512-LABEL: constant_shift_v32i8: ; AVX512: ## BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] Index: test/CodeGen/X86/vector-shift-ashr-512.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-512.ll +++ test/CodeGen/X86/vector-shift-ashr-512.ll @@ -1010,17 +1010,25 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: ; ALL: ## BB#0: -; ALL-NEXT: vpsravq {{.*}}(%rip), %zmm0, %zmm0 +; ALL-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; ALL-NEXT: vpsravq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = ashr <8 x i64> %a, ret <8 x i64> %shift } define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { -; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; ALL-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v16i32: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v16i32: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %shift = ashr <16 x i32> %a, ret <16 x i32> %shift } @@ -1050,7 +1058,8 @@ ; ; AVX512BW-LABEL: constant_shift_v32i16: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1059,7 +1068,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: ; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} 
ymm2 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] Index: test/CodeGen/X86/vector-shift-lshr-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-lshr-256.ll +++ test/CodeGen/X86/vector-shift-lshr-256.ll @@ -714,7 +714,7 @@ ; ; AVX2-LABEL: constant_shift_v32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -751,7 +751,7 @@ ; ; AVX512-LABEL: constant_shift_v32i8: ; AVX512: ## BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 Index: test/CodeGen/X86/vector-shift-lshr-512.ll =================================================================== --- test/CodeGen/X86/vector-shift-lshr-512.ll +++ test/CodeGen/X86/vector-shift-lshr-512.ll @@ -973,17 +973,25 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: ; ALL: ## BB#0: -; ALL-NEXT: vpsrlvq {{.*}}(%rip), %zmm0, %zmm0 +; ALL-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; ALL-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = lshr <8 x i64> %a, ret <8 x i64> %shift } define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { -; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; ALL-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v16i32: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v16i32: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %shift = lshr <16 x i32> %a, ret <16 x i32> %shift } @@ -1013,7 +1021,8 @@ ; ; AVX512BW-LABEL: constant_shift_v32i16: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1025,7 +1034,7 @@ ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; 
AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm2 Index: test/CodeGen/X86/vector-shift-shl-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-shl-256.ll +++ test/CodeGen/X86/vector-shift-shl-256.ll @@ -623,7 +623,7 @@ ; ; AVX2-LABEL: constant_shift_v32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 @@ -657,7 +657,7 @@ ; ; AVX512-LABEL: constant_shift_v32i8: ; AVX512: ## BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX512-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 Index: test/CodeGen/X86/vector-shift-shl-512.ll =================================================================== --- test/CodeGen/X86/vector-shift-shl-512.ll +++ test/CodeGen/X86/vector-shift-shl-512.ll @@ -967,17 +967,25 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind { ; ALL-LABEL: constant_shift_v8i64: ; ALL: ## BB#0: -; ALL-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0 +; ALL-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; ALL-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %shift = shl <8 x i64> %a, ret <8 x i64> %shift } define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { -; ALL-LABEL: constant_shift_v16i32: -; ALL: ## BB#0: -; ALL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 -; ALL-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v16i32: +; AVX512DQ: ## BB#0: +; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v16i32: +; AVX512BW: ## BB#0: +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %shift = shl <16 x i32> %a, ret <16 x i32> %shift } @@ -992,7 +1000,8 @@ ; ; AVX512BW-LABEL: constant_shift_v32i16: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1004,7 +1013,7 @@ ; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2 Index: test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v16.ll +++ test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -415,17 +415,17 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = 
[255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu8 {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpbroadcastw {{.*}}(%rip), %xmm2 +; AVX2OR512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -454,17 +454,17 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu8 {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 +; AVX2OR512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -475,14 +475,20 @@ ; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: +; AVX1: # BB#0: +; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpandq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle @@ -512,17 +518,17 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: 
shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu8 {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] -; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpbroadcastq {{.*}}(%rip), %xmm2 +; AVX2OR512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1029,17 +1029,11 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu8 {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpbroadcastw {{.*}}(%rip), %ymm2 +; AVX2OR512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1053,30 +1047,31 @@ ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqu8 {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpbroadcastw {{.*}}(%rip), %ymm2 +; AVX2OR512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } define <32 x i8> 
@shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) { -; AVX1OR2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; AVX1: # BB#0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpandq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastw {{.*}}(%rip), %ymm1 +; AVX512VL-NEXT: vpandq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> ret <32 x i8> %shuffle @@ -1136,7 +1131,7 @@ ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpbroadcastw {{.*}}(%rip), %ymm2 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1146,7 +1141,7 @@ ; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX512VL-NEXT: vmovdqu8 {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512VL-NEXT: vpbroadcastw {{.*}}(%rip), %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -1387,21 +1382,13 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] -; AVX512VL-NEXT: vmovdqu8 {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: 
shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] +; AVX2OR512VL-NEXT: vpbroadcastw {{.*}}(%rip), %ymm2 +; AVX2OR512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } @@ -1416,21 +1403,13 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] -; AVX512VL-NEXT: vmovdqu8 {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX2OR512VL: # BB#0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] +; AVX2OR512VL-NEXT: vpbroadcastw {{.*}}(%rip), %ymm2 +; AVX2OR512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle } Index: test/CodeGen/X86/vector-shuffle-combining-avx2.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -235,14 +235,16 @@ ; X32: # BB#0: ; X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: vpbroadcastd %xmm0, %ymm0 -; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: combine_permd_as_vpbroadcastd256: ; X64: # BB#0: ; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: vpbroadcastd %xmm0, %ymm0 -; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer) @@ -269,14 +271,16 @@ ; X32: # BB#0: ; 
X32-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X32-NEXT: vpbroadcastq %xmm0, %ymm0 -; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: combine_permd_as_vpbroadcastq256: ; X64: # BB#0: ; X64-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; X64-NEXT: vpbroadcastq %xmm0, %ymm0 -; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> ) Index: test/CodeGen/X86/vector-zext.ll =================================================================== --- test/CodeGen/X86/vector-zext.ll +++ test/CodeGen/X86/vector-zext.ll @@ -1547,13 +1547,15 @@ ; ; AVX2-LABEL: zext_8i8_to_8i32: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: retq ; ; AVX512-LABEL: zext_8i8_to_8i32: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512-NEXT: retq entry: