Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -1300,53 +1300,6 @@
                     Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>;
 
-  def int_x86_avx512_mask_shuf_f32x4_256 :
-          GCCBuiltin<"__builtin_ia32_shuf_f32x4_256_mask">,
-          Intrinsic<[llvm_v8f32_ty],
-          [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty],
-          [IntrNoMem]>;
-
-  def int_x86_avx512_mask_shuf_f32x4 :
-          GCCBuiltin<"__builtin_ia32_shuf_f32x4_mask">,
-          Intrinsic<[llvm_v16f32_ty],
-          [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty],
-          [IntrNoMem]>;
-
-  def int_x86_avx512_mask_shuf_f64x2_256 :
-          GCCBuiltin<"__builtin_ia32_shuf_f64x2_256_mask">,
-          Intrinsic<[llvm_v4f64_ty],
-          [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty],
-          [IntrNoMem]>;
-
-  def int_x86_avx512_mask_shuf_f64x2 :
-          GCCBuiltin<"__builtin_ia32_shuf_f64x2_mask">,
-          Intrinsic<[llvm_v8f64_ty],
-          [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty],
-          [IntrNoMem]>;
-
-  def int_x86_avx512_mask_shuf_i32x4_256 :
-          GCCBuiltin<"__builtin_ia32_shuf_i32x4_256_mask">,
-          Intrinsic<[llvm_v8i32_ty],
-          [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty],
-          [IntrNoMem]>;
-
-  def int_x86_avx512_mask_shuf_i32x4 :
-          GCCBuiltin<"__builtin_ia32_shuf_i32x4_mask">,
-          Intrinsic<[llvm_v16i32_ty],
-          [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty],
-          [IntrNoMem]>;
-
-  def int_x86_avx512_mask_shuf_i64x2_256 :
-          GCCBuiltin<"__builtin_ia32_shuf_i64x2_256_mask">,
-          Intrinsic<[llvm_v4i64_ty],
-          [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v4i64_ty, llvm_i8_ty],
-          [IntrNoMem]>;
-
-  def int_x86_avx512_mask_shuf_i64x2 :
-          GCCBuiltin<"__builtin_ia32_shuf_i64x2_mask">,
-          Intrinsic<[llvm_v8i64_ty],
-          [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty],
-          [IntrNoMem]>;
 }
 
 // Vector blend
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -75,6 +75,8 @@
   if (Name=="ssse3.pabs.b.128" || // Added in 6.0
       Name=="ssse3.pabs.w.128" || // Added in 6.0
       Name=="ssse3.pabs.d.128" || // Added in 6.0
+      Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
+      Name.startswith("avx512.mask.shuf.f") || // Added in 6.0
      Name.startswith("avx2.pabs.") || // Added in 6.0
      Name.startswith("avx512.mask.pabs.") || // Added in 6.0
      Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0
@@ -1266,7 +1268,29 @@
     else
       Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
                                         { 0, 1, 2, 3, 0, 1, 2, 3 });
-  } else if (IsX86 && (Name.startswith("avx512.mask.broadcastf") ||
+  } else if (IsX86 && (Name.startswith("avx512.mask.shuf.i") ||
+                       Name.startswith("avx512.mask.shuf.f"))) {
+    unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+    Type *VT = CI->getType();
+    unsigned NumLanes = VT->getPrimitiveSizeInBits() / 128;
+    unsigned NumElementsInLane = 128 / VT->getScalarSizeInBits();
+    unsigned ControlBitsMask = NumLanes - 1;
+    unsigned NumControlBits = NumLanes / 2;
+    SmallVector<uint32_t, 16> ShuffleMask;
+
+    for (unsigned l = 0; l != NumLanes; ++l) {
+      unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
+      // We actually need the other source.
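      // [Editorial worked example, not part of the original patch: for the
      // 512-bit f64x2/i64x2 case, NumLanes = 4, NumElementsInLane = 2 and
      // NumControlBits = 2, so Imm = 22 = 0b010110 decodes to the per-lane
      // control fields 2, 1, 1, 0. Destination lanes 2 and 3 are biased by
      // NumLanes to select from the second source, producing the final mask
      // <4,5,2,3,10,11,8,9>, i.e. zmm0[4,5,2,3],zmm1[2,3,0,1] in the tests.]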
+      if (l >= NumLanes / 2)
+        LaneMask += NumLanes;
+      for (unsigned i = 0; i != NumElementsInLane; ++i)
+        ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
+    }
+    Rep = Builder.CreateShuffleVector(CI->getArgOperand(0),
+                                      CI->getArgOperand(1), ShuffleMask);
+    Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep,
+                        CI->getArgOperand(3));
+  } else if (IsX86 && (Name.startswith("avx512.mask.broadcastf") ||
                        Name.startswith("avx512.mask.broadcasti"))) {
     unsigned NumSrcElts =
         CI->getArgOperand(0)->getType()->getVectorNumElements();
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5235,6 +5235,15 @@
   return V;
 }
 
+static bool hasOpcodeUserThroughBitcast(SDNode *N, unsigned Opcode) {
+  if (N->getOpcode() == ISD::BITCAST)
+    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
+         ++UI)
+      if (UI.getUse().getUser()->getOpcode() == Opcode)
+        return true;
+  return false;
+}
+
 static const Constant *getTargetConstantFromNode(SDValue Op) {
   Op = peekThroughBitcasts(Op);
 
@@ -29071,7 +29080,34 @@
     if (SDValue HAddSub = foldShuffleOfHorizOp(N))
       return HAddSub;
   }
-
+
+  // Attempt to use SHUF128 for a masked X86ISD::VPERM2X128 from this
+  // pattern:
+  //   a. t0 = VPERM2X128
+  //   b. t1 = VSELECT
+  //
+  // and create SHUF128 so the select can be folded into the instruction.
+  if (N->getOpcode() == X86ISD::VPERM2X128) {
+    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
+         ++UI) {
+      if (UI.getUse().getUser()->getOpcode() == ISD::VSELECT ||
+          hasOpcodeUserThroughBitcast(UI.getUse().getUser(), ISD::VSELECT)) {
+        uint64_t Imm = N->getConstantOperandVal(2);
+        if (((Imm & 0x02) == 0) && ((Imm & 0x20) == 0x20)) {
+          unsigned NewImm = (Imm & 0x01);
+          if (Imm & 0x10)
+            NewImm = (NewImm | 0x02);
+          SDLoc DL(N);
+          EVT VT = UI.getUse().getUser()->getValueType(0);
+          SDValue Shuffle = DAG.getNode(X86ISD::SHUF128, DL, VT,
+                                        N->getOperand(0), N->getOperand(1),
+                                        DAG.getConstant(NewImm, DL, MVT::i8));
+          return DAG.getBitcast(N->getSimpleValueType(0), Shuffle);
+        }
+      }
+    }
+  }
+
   // During Type Legalization, when promoting illegal vector types,
   // the backend might introduce new shuffle dag nodes and bitcasts.
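[Editorial note: a minimal standalone sketch of the immediate remapping performed by the combine above, assuming the usual VPERM2X128 immediate layout (bits 1:0 select the source half for the low destination lane, bits 5:4 for the high lane) and the 256-bit SHUF128 layout (bit 0 picks the half of source 0, bit 1 the half of source 1). The function name is hypothetical and the code is illustrative only, not part of the patch:

    #include <cassert>
    #include <cstdint>

    // A VPERM2X128 immediate is convertible only when the low result lane
    // comes from source 0 ((Imm & 0x02) == 0) and the high result lane comes
    // from source 1 ((Imm & 0x20) == 0x20), matching the guard above.
    static uint8_t vperm2x128ToShuf128Imm(uint8_t Imm) {
      assert((Imm & 0x02) == 0 && (Imm & 0x20) == 0x20 && "not convertible");
      uint8_t NewImm = Imm & 0x01; // which 128-bit half of source 0
      if (Imm & 0x10)
        NewImm |= 0x02;            // which 128-bit half of source 1
      return NewImm;
    }

    // Example: Imm = 0x31 (ymm0[2,3],ymm1[2,3]) maps to NewImm = 0x03.

Immediates that fail the guard, including the lane-zeroing forms, simply keep the existing VPERM2X128 + VSELECT lowering.]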
 //
@@ -30218,6 +30254,7 @@
   MVT EltVT = VT.getVectorElementType();
   SDLoc DL(Op.getNode());
+
   auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0,
                                       SDValue Op1, SDValue Op2) {
     Op0 = DAG.getBitcast(VT, Op0);
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -1051,22 +1051,6 @@
                      X86ISD::SCALEFS, 0),
   X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::SCALEFS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::SHUF128, 0),
-  X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::SHUF128, 0),
-  X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::SHUF128, 0),
-  X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::SHUF128, 0),
-  X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::SHUF128, 0),
-  X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::SHUF128, 0),
-  X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::SHUF128, 0),
-  X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK,
-                     X86ISD::SHUF128, 0),
   X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -4,6 +4,233 @@
 
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
 
+define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_shuffle_f32x4:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,4,5]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_shuffle_f32x4:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,4,5]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <16 x float> %__A, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
+  ret <16 x float> %shuffle
+}
+
+define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_mask_shuffle_f32x4:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,8,9,10,11]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_f32x4:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,8,9,10,11]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <16 x float> %__A, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
+  %0 = bitcast i16 %__U to <16 x i1>
+  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_maskz_shuffle_f32x4:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,8,9,10,11,8,9,10,11]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,8,9,10,11,8,9,10,11]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <16 x float> %__A, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
+  %0 = bitcast i16 %__U to <16 x i1>
+  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
+  ret <16 x float> %1
+}
+
+define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_shuffle_f64x2:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_shuffle_f64x2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_mask_shuffle_f64x2:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_f64x2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  %0 = bitcast i8 %__U to <8 x i1>
+  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_maskz_shuffle_f64x2:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  %0 = bitcast i8 %__U to <8 x i1>
+  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
+  ret <8 x double> %1
+}
+
+define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_shuffle_i32x4:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_shuffle_i32x4:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_mask_shuffle_i32x4:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_i32x4:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
+  %1 = bitcast <8 x i64> %__W to <16 x i32>
+  %2 = bitcast i16 %__U to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
+  %4 = bitcast <16 x i32> %3 to <8 x i64>
+  ret <8 x i64> %4
+}
+
+define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_maskz_shuffle_i32x4:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
+  %1 = bitcast i16 %__U to <16 x i1>
+  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
+  %3 = bitcast <16 x i32> %2 to <8 x i64>
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_shuffle_i64x2:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_shuffle_i64x2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_mask_shuffle_i64x2:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_i64x2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  %0 = bitcast i8 %__U to <8 x i1>
+  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
+  ret <8 x i64> %1
+}
+
+define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_maskz_shuffle_i64x2:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  %0 = bitcast i8 %__U to <8 x i1>
+  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
+  ret <8 x i64> %1
+}
+
+
+
 define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
;
X32-LABEL: test_mm512_mask_set1_epi32: ; X32: # BB#0: # %entry Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1648,6 +1648,76 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4: +; CHECK: ## BB#0: +; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm3 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2: +; CHECK: ## BB#0: +; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 +; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) + %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) + + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4: +; CHECK: ## BB#0: +; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm3 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> 
%x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2: +; CHECK: ## BB#0: +; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1) + %res2 = add <8 x i64> %res, %res1 + ret <8 x i64> %res2 +} + declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8) define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { Index: test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -3339,75 +3339,6 @@ ret i8 %res13 } -declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 -} - -declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm3 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] -; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) - - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 -} - -declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = 
zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 -} - -declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) { -; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] -; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 -} - declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32) define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) { Index: test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll =================================================================== --- test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll +++ test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll @@ -14,10 +14,10 @@ define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $-41, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %res = select <8 x i1> , <8 x float> %shuf, <8 x float> %vec3 @@ -27,10 +27,9 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $-41, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %res = select <8 x i1> , <8 x float> %shuf, <8 x float> zeroinitializer @@ -39,10 +38,10 @@ define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $-63, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %res 
= select <8 x i1> , <8 x float> %shuf, <8 x float> %vec3 @@ -52,10 +51,9 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $-63, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %res = select <8 x i1> , <8 x float> %shuf, <8 x float> zeroinitializer @@ -64,10 +62,10 @@ define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: movb $107, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %res = select <8 x i1> , <8 x float> %shuf, <8 x float> %vec3 @@ -77,10 +75,9 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: movb $107, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %res = select <8 x i1> , <8 x float> %shuf, <8 x float> zeroinitializer @@ -97,10 +94,10 @@ define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $66, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %res = select <8 x i1> , <8 x float> %shuf, <8 x float> %vec3 @@ -110,10 +107,9 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $66, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %res = select <8 x i1> , <8 x float> %shuf, <8 x float> zeroinitializer @@ -131,10 +127,10 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: movb $-24, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] 
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -145,10 +141,9 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: movb $-24, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -159,10 +154,10 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: movb $-6, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -173,10 +168,9 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: movb $-6, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -187,10 +181,10 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: movb $-50, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -201,10 +195,9 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: movb $-50, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -224,10 +217,10 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) { ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: movb $-26, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: 
vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] +; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -238,10 +231,9 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: movb $-26, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> @@ -498,10 +490,10 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $13, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> %vec3 @@ -511,10 +503,9 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $13, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> zeroinitializer @@ -523,10 +514,10 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $11, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> %vec3 @@ -536,10 +527,9 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $11, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> zeroinitializer @@ -548,10 +538,10 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: movb 
$14, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> %vec3 @@ -561,10 +551,9 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: movb $14, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> zeroinitializer @@ -581,10 +570,10 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: movb $12, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> %vec3 @@ -594,10 +583,9 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: movb $12, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> zeroinitializer @@ -615,10 +603,10 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: movb $14, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -629,10 +617,9 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: movb $14, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -643,10 +630,10 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) { ; CHECK-LABEL: 
test_4xdouble_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: movb $8, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -657,10 +644,9 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: movb $8, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -671,10 +657,10 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: movb $6, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -685,10 +671,9 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] ; CHECK-NEXT: movb $6, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -708,10 +693,10 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) { ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: movb $13, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -722,10 +707,9 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] ; CHECK-NEXT: movb $13, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] ; CHECK-NEXT: retq %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> @@ -982,10 +966,10 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, 
<8 x i32> %vec3) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: movb $26, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %res = select <8 x i1> , <8 x i32> %shuf, <8 x i32> %vec3 @@ -995,10 +979,9 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: movb $26, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %res = select <8 x i1> , <8 x i32> %shuf, <8 x i32> zeroinitializer @@ -1007,10 +990,10 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $-4, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %res = select <8 x i1> , <8 x i32> %shuf, <8 x i32> %vec3 @@ -1020,10 +1003,9 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $-4, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %res = select <8 x i1> , <8 x i32> %shuf, <8 x i32> zeroinitializer @@ -1032,10 +1014,10 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: movb $51, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %res = select <8 x i1> , <8 x i32> %shuf, <8 x i32> %vec3 @@ -1045,10 +1027,9 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: movb $51, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %res = select <8 x i1> , <8 x i32> %shuf, <8 x i32> zeroinitializer @@ -1065,10 +1046,10 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x 
i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) { ; CHECK-LABEL: test_8xi32_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $92, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %res = select <8 x i1> , <8 x i32> %shuf, <8 x i32> %vec3 @@ -1078,10 +1059,9 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; CHECK-NEXT: movb $92, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %res = select <8 x i1> , <8 x i32> %shuf, <8 x i32> zeroinitializer @@ -1099,10 +1079,11 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: movb $64, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],ymm2[4,5,6,7] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1113,10 +1094,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: movb $64, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1127,10 +1108,11 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) { ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: movb $-104, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],ymm2[0,1,2,3] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> @@ -1141,10 +1123,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p) { ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: ; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: movb $-104, %al ; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] ; CHECK-NEXT: retq %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf 
= shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32>
@@ -1155,10 +1137,11 @@
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: movb $113, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],ymm2[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1169,10 +1152,10 @@
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: movb $113, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1192,10 +1175,11 @@
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
; CHECK-NEXT: movb $45, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],ymm2[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1206,10 +1190,10 @@
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: movb $45, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1466,10 +1450,10 @@
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: movb $13, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%res = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i64> %shuf, <4 x i64> %vec3
@@ -1479,10 +1463,9 @@
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: movb $13, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%res = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -1491,10 +1474,10 @@
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: movb $11, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%res = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i64> %shuf, <4 x i64> %vec3
@@ -1504,10 +1487,9 @@
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: movb $11, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%res = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -1516,10 +1498,10 @@
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: movb $3, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%res = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> %shuf, <4 x i64> %vec3
@@ -1529,10 +1511,9 @@
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: movb $3, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%res = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -1549,10 +1530,10 @@
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: movb $14, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%res = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i64> %shuf, <4 x i64> %vec3
@@ -1562,10 +1543,9 @@
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: movb $14, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%res = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -1583,10 +1563,10 @@
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: movb $2, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1597,10 +1577,9 @@
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: movb $2, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1611,10 +1590,10 @@
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: movb $14, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1625,10 +1604,9 @@
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: movb $14, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1639,10 +1617,10 @@
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: movb $8, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1653,10 +1631,9 @@
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: movb $8, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1676,10 +1653,10 @@
define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: movb $10, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1690,10 +1667,9 @@
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: movb $10, %al
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
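(Editorial note, illustrative only: the IR below is not part of the patch; %a, %b and %passthru are placeholder names, and the constants mirror test_4xi64_masked_shuff_mask0 above. The writemask constant in these tests is the byte loaded by movb, read low-bit-first, e.g. $13 = 0b1101 is <i1 true, i1 false, i1 true, i1 true>.)

; A 128-bit lane shuffle followed by a per-element select: the shape that
; the new combine emits as one masked vshufi64x2 instead of
; vperm2i128 + vpblendmq.
%shuf = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%res = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i64> %shuf, <4 x i64> %passthru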
Index: test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -4,6 +4,235 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
+define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_shuffle_f32x4:
+; X32: # BB#0: # %entry
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_f32x4:
+; X64: # BB#0: # %entry
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_mask_shuffle_f32x4:
+; X32: # BB#0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_f32x4:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
+ ret <8 x float> %1
+}
+
+define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_maskz_shuffle_f32x4:
+; X32: # BB#0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
+ ret <8 x float> %1
+}
+
+define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_shuffle_f64x2:
+; X32: # BB#0: # %entry
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_f64x2:
+; X64: # BB#0: # %entry
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_mask_shuffle_f64x2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_f64x2:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
+ ret <4 x double> %1
+}
+
+define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_maskz_shuffle_f64x2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
+ ret <4 x double> %1
+}
+
+define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_shuffle_i32x4:
+; X32: # BB#0: # %entry
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_i32x4:
+; X64: # BB#0: # %entry
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_mask_shuffle_i32x4:
+; X32: # BB#0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_i32x4:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast <4 x i64> %shuffle to <8 x i32>
+ %1 = bitcast <4 x i64> %__W to <8 x i32>
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
+ %4 = bitcast <8 x i32> %3 to <4 x i64>
+ ret <4 x i64> %4
+}
+
+define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_maskz_shuffle_i32x4:
+; X32: # BB#0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast <4 x i64> %shuffle to <8 x i32>
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
+ %3 = bitcast <8 x i32> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_shuffle_i64x2:
+; X32: # BB#0: # %entry
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_i64x2:
+; X64: # BB#0: # %entry
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_mask_shuffle_i64x2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_i64x2:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm256_maskz_shuffle_i64x2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_i64x2:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
+ ret <4 x i64> %1
+}
+
define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) {
; X32-LABEL: test_mm_mask_set1_epi32:
; X32: # BB#0: # %entry
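(Editorial note, illustrative only: the IR below is not part of the patch; %U, %shuffle and %res are placeholder names. The 4-lane tests above narrow the 8-bit mask the way clang does for 256-bit ops with fewer than eight elements: bitcast the i8 to <8 x i1>, then extract the low four bits before the select.)

; Clang-style mask narrowing for a 4-element masked operation.
%m = bitcast i8 %U to <8 x i1>
%extract = shufflevector <8 x i1> %m, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%res = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer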
Index: test/CodeGen/X86/avx512vl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics.ll
+++ test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -2724,15 +2724,13 @@
define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256:
; CHECK: ## BB#0:
+; CHECK-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd9,0x16]
-; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16]
-; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0xd0]
+; CHECK-NEXT: vmovaps %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc8]
; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1)
@@ -2747,15 +2745,13 @@
define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd9,0x16]
-; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16]
-; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16]
+; CHECK-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovapd %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0xd0]
+; CHECK-NEXT: vmovapd %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0xc8]
; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1)
@@ -2770,11 +2766,10 @@
define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xd1,0x16]
-; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc1,0x16]
+; CHECK-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0]
; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xd0]
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
@@ -2788,11 +2783,10 @@
define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256:
; CHECK: ## BB#0:
+; CHECK-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xd1,0x16]
-; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xd0]
; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
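(Editorial note, illustrative only, and inferred from the CHECK comments above rather than stated in the patch: on a 256-bit vshuf*x2 there are two 128-bit result lanes, the low lane taken from the first source and the high lane from the second, with one immediate bit selecting each source lane. For the immediate 22 used throughout these intrinsic tests:)

imm = 22 = 0b10110
low 128 bits  = src1 lane (imm & 1)        = ymm0[0,1] (qwords)
high 128 bits = src2 lane ((imm >> 1) & 1) = ymm1[2,3] (qwords)

(Each half comes from its own position, so once the intrinsic is auto-upgraded to a plain shufflevector + select, the shuffle degenerates to an unmasked blend, vblendpd $12 / vpblendd $240, followed by a masked move, which is exactly the codegen the updated CHECK lines expect.)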