Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -1615,6 +1615,24 @@
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                      llvm_i16_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_movddup_128 :
+          GCCBuiltin<"__builtin_ia32_movddup128_mask">,
+          Intrinsic<[llvm_v2f64_ty],
+          [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
+          [IntrNoMem]>;
+
+  def int_x86_avx512_mask_movddup_256 :
+          GCCBuiltin<"__builtin_ia32_movddup256_mask">,
+          Intrinsic<[llvm_v4f64_ty],
+          [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
+          [IntrNoMem]>;
+
+  def int_x86_avx512_mask_movddup_512 :
+          GCCBuiltin<"__builtin_ia32_movddup512_mask">,
+          Intrinsic<[llvm_v8f64_ty],
+          [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty],
+          [IntrNoMem]>;
 }
 
 // Vector blend
Index: lib/Target/X86/InstPrinter/X86InstComments.cpp
===================================================================
--- lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -336,24 +336,15 @@
     DecodeMOVSHDUPMask(VT, ShuffleMask);
     break;
   }
-  case X86::VMOVDDUPYrr:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
-    // FALL THROUGH.
-  case X86::VMOVDDUPYrm:
-    DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask);
-    break;
-
-  case X86::MOVDDUPrr:
-  case X86::VMOVDDUPrr:
-    Src1Name = getRegName(MI->getOperand(1).getReg());
+  CASE_MOVDUP(MOVDDUP, r)
+    Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg());
     // FALL THROUGH.
-  case X86::MOVDDUPrm:
-  case X86::VMOVDDUPrm:
+  CASE_MOVDUP(MOVDDUP, m) {
+    MVT VT = getRegOperandVectorVT(MI, MVT::f64, 0);
     DestName = getRegName(MI->getOperand(0).getReg());
-    DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask);
+    DecodeMOVDDUPMask(VT, ShuffleMask);
     break;
-
+  }
   case X86::PSLLDQri:
   case X86::VPSLLDQri:
     Src1Name = getRegName(MI->getOperand(1).getReg());
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10841,6 +10841,11 @@
           lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
     return Unpck;
 
+  if (isSingleInputShuffleMask(Mask))
+    // Use low duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
 }
 
@@ -10881,6 +10886,11 @@
           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
     return Unpck;
 
+  if (isSingleInputShuffleMask(Mask))
+    // Use low duplicate instructions for masks that match their pattern.
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8i64, V1);
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
 }
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -4235,26 +4235,6 @@
 defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>;
 
 //===----------------------------------------------------------------------===//
-// AVX-512 - MOVDDUP
-//===----------------------------------------------------------------------===//
-
-multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
-                          X86MemOperand x86memop, PatFrag memop_frag> {
-def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                   [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
-def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                   [(set RC:$dst,
-                     (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
-}
-
-defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, loadv8f64>,
-                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
-          (VMOVDDUPZrm addr:$src)>;
-
-//===----------------------------------------------------------------------===//
 // Move Low to High and High to Low packed FP Instructions
 //===----------------------------------------------------------------------===//
 def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
@@ -7137,6 +7117,52 @@
 defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>;
 defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              X86VectorVTInfo _> {
+  defm rr : AVX512_maskable, EVEX;
+  let mayLoad = 1 in
+  defm rm : AVX512_maskable,
+            EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+}
+
+multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 AVX512VLVectorVTInfo VTInfo> {
+
+  defm Z : avx512_unary_rm, EVEX_V512;
+
+  let Predicates = [HasAVX512, HasVLX] in {
+    defm Z256 : avx512_unary_rm,
+                EVEX_V256;
+    defm Z128 : avx512_movddup_128,
+                EVEX_V128;
+  }
+}
+
+multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{
+  defm NAME: avx512_movddup_common, XD, VEX_W;
+  let isCodeGenOnly = 1 in
+  defm NAME#_I: avx512_movddup_common;
+}
+
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
+
+def : Pat<(X86Movddup (loadv2f64 addr:$src)),
+          (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>;
+def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+          (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - Unpack Instructions
 //===----------------------------------------------------------------------===//
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -5206,21 +5206,30 @@
 def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
-                      (v4f64 (X86Movddup
-                              (scalar_to_vector (loadf64 addr:$src)))))]>,
+                      (v4f64 (X86Movddup (loadv4f64
                        addr:$src))))]>,
                     Sched<[WriteLoad]>;
 }
 
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
   defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
   defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
 }
 defm MOVDDUP : sse3_replicate_dfp<"movddup">;
 
-let Predicates = [HasAVX] in {
+
+let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(X86Movddup (loadv2f64 addr:$src)),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+  // 256-bit version
+  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4i64 VR256:$src)),
+            (VMOVDDUPYrr VR256:$src)>;
+}
+
+let Predicates = [HasAVX] in {
   def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
   def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
@@ -5228,16 +5237,6 @@
   def : Pat<(X86Movddup (bc_v2f64
                              (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-
-  // 256-bit version
-  def : Pat<(X86Movddup (loadv4f64 addr:$src)),
-            (VMOVDDUPYrm addr:$src)>;
-  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
-            (VMOVDDUPYrm addr:$src)>;
-  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
-            (VMOVDDUPYrm addr:$src)>;
-  def : Pat<(X86Movddup (v4i64 VR256:$src)),
-            (VMOVDDUPYrr VR256:$src)>;
 }
 
 let Predicates = [UseAVX, OptForSize] in {
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -792,6 +792,12 @@
                      X86ISD::FMIN, X86ISD::FMIN_RND),
   X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FMIN, X86ISD::FMIN_RND),
+  X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK,
+                     X86ISD::MOVDDUP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK,
+                     X86ISD::MOVDDUP, 0),
+  X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK,
+                     X86ISD::MOVDDUP, 0),
   X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK,
                      X86ISD::MOVSHDUP, 0),
   X86_INTRINSIC_DATA(avx512_mask_movshdup_256,
Index: test/CodeGen/X86/avx-isa-check.ll
===================================================================
--- test/CodeGen/X86/avx-isa-check.ll
+++ test/CodeGen/X86/avx-isa-check.ll
@@ -406,3 +406,26 @@
   ret void
 }
 
+define <2 x double> @test39(double* %ptr) nounwind {
+  %a = load double, double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+ }
+
+define <2 x double> @test40(<2 x double>* %ptr) nounwind {
+  %v = load <2 x double>, <2 x double>* %ptr
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+ }
+
+define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
+  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -4722,3 +4722,27 @@
   ret <16 x float> %res4
 }
 
+declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
+
+define <8 x
double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovddup %zmm0, %zmm1 {%k1} +; CHECK-NEXT: ## zmm1 = zmm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vmovddup %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: ## zmm2 = zmm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vmovddup %zmm0, %zmm0 +; CHECK-NEXT: ## zmm0 = zmm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) + %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1) + %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res2, %res3 + ret <8 x double> %res4 +} + Index: test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512vl-intrinsics.ll +++ test/CodeGen/X86/avx512vl-intrinsics.ll @@ -5430,4 +5430,50 @@ %res4 = fadd <8 x float> %res2, %res3 ret <8 x float> %res4 } +declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} +; CHECK-NEXT: ## xmm1 = xmm0[0,0] +; CHECK-NEXT: vmovddup %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: ## xmm2 = xmm0[0,0] +; CHECK-NEXT: vmovddup %xmm0, %xmm0 +; CHECK-NEXT: ## xmm0 = xmm0[0,0] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) + %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1) + %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res2, %res3 + ret <2 x double> %res4 +} + +declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} +; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2] +; CHECK-NEXT: vmovddup %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2] +; CHECK-NEXT: vmovddup %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) + %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1) + %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2) + %res3 = fadd <4 x double> %res, %res1 + %res4 = fadd <4 x double> %res2, %res3 + ret <4 x double> %res4 +} Index: 
test/CodeGen/X86/vector-shuffle-128-v2.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v2.ll +++ test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -1461,8 +1461,7 @@ ; ; AVX512VL-LABEL: insert_dup_mem_v2f64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovsd (%rdi), %xmm0 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX512VL-NEXT: retq %a = load double, double* %ptr %v = insertelement <2 x double> undef, double %a, i32 0 @@ -1470,6 +1469,43 @@ ret <2 x double> %shuffle } +define <2 x double> @insert_dup_mem128_v2f64(<2 x double>* %ptr) nounwind { +; SSE2-LABEL: insert_dup_mem128_v2f64: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_dup_mem128_v2f64: +; SSE3: # BB#0: +; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_dup_mem128_v2f64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_mem128_v2f64: +; SSE41: # BB#0: +; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_mem128_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_mem128_v2f64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX512VL-NEXT: retq + %v = load <2 x double>, <2 x double>* %ptr + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> + ret <2 x double> %shuffle +} + + define <2 x i64> @insert_dup_mem_v2i64(i64* %ptr) { ; SSE-LABEL: insert_dup_mem_v2i64: ; SSE: # BB#0: Index: test/CodeGen/X86/vector-shuffle-256-v4.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v4.ll +++ test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -15,6 +15,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0000: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -30,6 +35,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0001: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -47,6 +57,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0020: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -63,6 +78,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0300: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -79,6 +99,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_1000: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; AVX512VL-NEXT: retq %shuffle = 
shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -94,6 +119,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_2200: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -110,6 +140,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_3330: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -125,6 +160,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_3210: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -134,11 +174,12 @@ ; ALL: # BB#0: ; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3] ; ALL-NEXT: retq - +; ; AVX512VL-LABEL: shuffle_v4f64_0023: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpermilpd $8, %ymm0, %ymm0 ; AVX512VL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -148,6 +189,26 @@ ; ALL: # BB#0: ; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0022: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX512VL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64mem_0022(<4 x double>* %ptr, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64mem_0022: +; ALL: # BB#0: +; ALL-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2] +; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64mem_0022: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2] +; AVX512VL-NEXT: retq + %a = load <4 x double>, <4 x double>* %ptr %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -157,6 +218,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_1032: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermilpd $5, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -166,6 +232,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_1133: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermilpd $15, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -175,6 +246,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_1023: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermilpd $9, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -184,6 +260,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_1022: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermilpd $1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = 
shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -200,6 +281,12 @@ ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0423: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -211,6 +298,13 @@ ; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0462: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; AVX512VL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -220,6 +314,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0426: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vunpcklpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -229,6 +328,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_1537: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vunpckhpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -238,6 +342,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_4062: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vunpcklpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -247,6 +356,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_5173: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vunpckhpd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -256,6 +370,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_5163: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vshufpd $11, %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -265,6 +384,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0527: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -274,6 +398,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_4163: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x 
double> %shuffle } @@ -283,6 +412,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0145: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -292,6 +426,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_4501: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -301,6 +440,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0167: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -311,6 +455,12 @@ ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_1054: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermilpd $5, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -321,6 +471,12 @@ ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_3254: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-NEXT: vpermilpd $5, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -331,6 +487,12 @@ ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_3276: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpermilpd $5, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -341,6 +503,12 @@ ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_1076: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX512VL-NEXT: vpermilpd $5, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -359,6 +527,13 @@ ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_0415: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -368,6 +543,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4f64_u062: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vunpcklpd %ymm0, %ymm1, 
%ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %shuffle } @@ -383,6 +563,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0000: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -398,6 +583,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0001: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -415,6 +605,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0020: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -431,6 +626,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0112: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -447,6 +647,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0300: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -463,6 +668,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_1000: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -478,6 +688,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_2200: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -494,6 +709,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_3330: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -509,6 +729,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_3210: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -526,6 +751,12 @@ ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0124: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -544,6 
+775,13 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0142: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -564,6 +802,13 @@ ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0412: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] +; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -583,6 +828,12 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_4012: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -592,6 +843,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0145: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -610,6 +866,13 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0451: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -619,6 +882,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_4501: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -637,6 +905,13 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_4015: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -654,6 +929,12 @@ ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,1] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_2u35: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,1] +; AVX512VL-NEXT: retq 
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -674,6 +955,13 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_1251: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -690,6 +978,12 @@ ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_1054: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -706,6 +1000,12 @@ ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_3254: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -722,6 +1022,12 @@ ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_3276: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -738,6 +1044,12 @@ ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_1076: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -756,6 +1068,13 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_0415: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -771,6 +1090,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_z4z6: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23] +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> ret <4 x 
i64> %shuffle } @@ -786,6 +1110,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_5zuz: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> ret <4 x i64> %shuffle } @@ -800,6 +1129,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i64_40u2: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpunpcklqdq %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } @@ -807,6 +1141,9 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) { ; ALL-LABEL: stress_test1: ; ALL: retq +; +; AVX512VL-LABEL: stress_test1: +; AVX512VL: retq %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> @@ -820,6 +1157,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vmovq %rdi, %xmm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: insert_reg_and_zero_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovq %rdi, %xmm0 +; AVX512VL-NEXT: retq %v = insertelement <4 x i64> undef, i64 %a, i64 0 %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> ret <4 x i64> %shuffle @@ -830,6 +1172,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: retq +; +; AVX512VL-LABEL: insert_mem_and_zero_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovq (%rdi), %xmm0 +; AVX512VL-NEXT: retq %a = load i64, i64* %ptr %v = insertelement <4 x i64> undef, i64 %a, i64 0 %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> @@ -842,6 +1189,12 @@ ; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] ; ALL-NEXT: retq +; +; AVX512VL-LABEL: insert_reg_and_zero_v4f64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: retq %v = insertelement <4 x double> undef, double %a, i32 0 %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> ret <4 x double> %shuffle @@ -852,6 +1205,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; ALL-NEXT: retq +; +; AVX512VL-LABEL: insert_mem_and_zero_v4f64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovsd (%rdi), %xmm0 +; AVX512VL-NEXT: retq %a = load double, double* %ptr %v = insertelement <4 x double> undef, double %a, i32 0 %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> @@ -863,6 +1221,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: splat_mem_v4f64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX512VL-NEXT: retq %a = load double, double* %ptr %v = insertelement <4 x double> undef, double %a, i32 0 %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> @@ -874,6 +1237,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: splat_mem_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512VL-NEXT: retq %a 
= load i64, i64* %ptr %v = insertelement <4 x i64> undef, i64 %a, i64 0 %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> @@ -885,6 +1253,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: splat_mem_v4f64_2: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX512VL-NEXT: retq %1 = load double, double* %p %2 = insertelement <2 x double> undef, double %1, i32 0 %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer @@ -902,6 +1275,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: splat_v4f64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512VL-NEXT: retq %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer ret <4 x double> %1 } @@ -917,6 +1295,12 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: splat_mem_v4i64_from_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa64 (%rdi), %xmm0 +; AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512VL-NEXT: retq %v = load <2 x i64>, <2 x i64>* %ptr %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> ret <4 x i64> %shuffle @@ -933,6 +1317,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: splat_mem_v4f64_from_v2f64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX512VL-NEXT: retq %v = load <2 x double>, <2 x double>* %ptr %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> ret <4 x double> %shuffle @@ -944,6 +1333,12 @@ ; ALL-NEXT: vmovaps (%rdi), %xmm0 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: splat128_mem_v4i64_from_v2i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa64 (%rdi), %xmm0 +; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %v = load <2 x i64>, <2 x i64>* %ptr %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> ret <4 x i64> %shuffle @@ -955,6 +1350,12 @@ ; ALL-NEXT: vmovaps (%rdi), %xmm0 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: splat128_mem_v4f64_from_v2f64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovapd (%rdi), %xmm0 +; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %v = load <2 x double>, <2 x double>* %ptr %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> ret <4 x double> %shuffle @@ -970,6 +1371,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: bitcast_v4f64_0426: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> %bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float> %shuffle32 = shufflevector <8 x float> %bitcast32, <8 x float> undef, <8 x i32> @@ -989,6 +1395,11 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: retq +; +; AVX512VL-LABEL: concat_v4i64_0167: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-NEXT: retq %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> %a1hi = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> %shuffle64 = shufflevector <2 x i64> %a0lo, <2 x i64> %a1hi, <4 x i32> @@ -1000,6 +1411,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, 
%ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: concat_v4i64_0145_bc: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: retq %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> %a1lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> %bc0lo = bitcast <2 x i64> %a0lo to <4 x i32> @@ -1014,6 +1430,11 @@ ; ALL: # BB#0: ; ALL-NEXT: vbroadcastsd (%rdi), %ymm0 ; ALL-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_mem_v4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512VL-NEXT: retq %tmp = load i64, i64* %ptr, align 1 %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0 %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <4 x i32> zeroinitializer Index: test/CodeGen/X86/vector-shuffle-512-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v8.ll +++ test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -538,15 +538,29 @@ ; ; AVX512F-LABEL: shuffle_v8f64_00224466: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,2,4,4,6,6] -; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6] ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00224466: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,4,0,4,0,6,0,6,0] -; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6] +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64mem_00224466(<8 x double>* %ptr, <8 x double> %b) { +; AVX512F-LABEL: shuffle_v8f64mem_00224466: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6] +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8f64mem_00224466: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6] ; AVX512F-32-NEXT: retl + %a = load <8 x double>, <8 x double>* %ptr %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle } @@ -1524,15 +1538,29 @@ ; ; AVX512F-LABEL: shuffle_v8i64_00224466: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,2,4,4,6,6] -; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovddup %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00224466: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,4,0,4,0,6,0,6,0] -; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovddup %zmm0, %zmm0 +; AVX512F-32-NEXT: retl + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64mem_00224466(<8 x i64>* %ptr, <8 x i64> %b) { +; AVX512F-LABEL: shuffle_v8i64mem_00224466: +; AVX512F: # BB#0: +; AVX512F-NEXT: vmovddup (%rdi), %zmm0 +; AVX512F-NEXT: retq +; +; AVX512F-32-LABEL: shuffle_v8i64mem_00224466: +; AVX512F-32: # BB#0: +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX512F-32-NEXT: vmovddup (%eax), %zmm0 ; AVX512F-32-NEXT: retl + %a = load <8 x i64>, <8 x i64>* %ptr %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle } @@ -2101,7 +2129,7 @@ ; AVX512F-32-LABEL: test_vshuff64x2_512_maskz: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-32-NEXT: vpandq .LCPI122_0, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpandq 
.LCPI124_0, %zmm2, %zmm2 ; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1] ; AVX512F-32-NEXT: retl @@ -2122,7 +2150,7 @@ ; AVX512F-32-LABEL: test_vshufi64x2_512_mask: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2 -; AVX512F-32-NEXT: vpandq .LCPI123_0, %zmm2, %zmm2 +; AVX512F-32-NEXT: vpandq .LCPI125_0, %zmm2, %zmm2 ; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1] ; AVX512F-32-NEXT: retl @@ -2159,7 +2187,7 @@ ; AVX512F-32-LABEL: test_vshuff64x2_512_mem_mask: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-32-NEXT: vpandq .LCPI125_0, %zmm1, %zmm1 +; AVX512F-32-NEXT: vpandq .LCPI127_0, %zmm1, %zmm1 ; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] @@ -2182,7 +2210,7 @@ ; AVX512F-32-LABEL: test_vshuff64x2_512_mem_maskz: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1 -; AVX512F-32-NEXT: vpandq .LCPI126_0, %zmm1, %zmm1 +; AVX512F-32-NEXT: vpandq .LCPI128_0, %zmm1, %zmm1 ; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1] Index: test/MC/X86/avx512-encodings.s =================================================================== --- test/MC/X86/avx512-encodings.s +++ test/MC/X86/avx512-encodings.s @@ -18561,3 +18561,39 @@ // CHECK: encoding: [0x62,0x61,0xfd,0x08,0x17,0x8a,0xf8,0xfb,0xff,0xff] vmovhpd %xmm25, -1032(%rdx) +// CHECK: vmovddup %zmm29, %zmm5 +// CHECK: encoding: [0x62,0x91,0xff,0x48,0x12,0xed] + vmovddup %zmm29, %zmm5 + +// CHECK: vmovddup %zmm29, %zmm5 {%k4} +// CHECK: encoding: [0x62,0x91,0xff,0x4c,0x12,0xed] + vmovddup %zmm29, %zmm5 {%k4} + +// CHECK: vmovddup %zmm29, %zmm5 {%k4} {z} +// CHECK: encoding: [0x62,0x91,0xff,0xcc,0x12,0xed] + vmovddup %zmm29, %zmm5 {%k4} {z} + +// CHECK: vmovddup (%rcx), %zmm5 +// CHECK: encoding: [0x62,0xf1,0xff,0x48,0x12,0x29] + vmovddup (%rcx), %zmm5 + +// CHECK: vmovddup 291(%rax,%r14,8), %zmm5 +// CHECK: encoding: [0x62,0xb1,0xff,0x48,0x12,0xac,0xf0,0x23,0x01,0x00,0x00] + vmovddup 291(%rax,%r14,8), %zmm5 + +// CHECK: vmovddup 8128(%rdx), %zmm5 +// CHECK: encoding: [0x62,0xf1,0xff,0x48,0x12,0x6a,0x7f] + vmovddup 8128(%rdx), %zmm5 + +// CHECK: vmovddup 8192(%rdx), %zmm5 +// CHECK: encoding: [0x62,0xf1,0xff,0x48,0x12,0xaa,0x00,0x20,0x00,0x00] + vmovddup 8192(%rdx), %zmm5 + +// CHECK: vmovddup -8192(%rdx), %zmm5 +// CHECK: encoding: [0x62,0xf1,0xff,0x48,0x12,0x6a,0x80] + vmovddup -8192(%rdx), %zmm5 + +// CHECK: vmovddup -8256(%rdx), %zmm5 +// CHECK: encoding: [0x62,0xf1,0xff,0x48,0x12,0xaa,0xc0,0xdf,0xff,0xff] + vmovddup -8256(%rdx), %zmm5 + Index: test/MC/X86/x86-64-avx512f_vl.s =================================================================== --- test/MC/X86/x86-64-avx512f_vl.s +++ test/MC/X86/x86-64-avx512f_vl.s @@ -22123,3 +22123,75 @@ // CHECK: encoding: [0x62,0x61,0x7e,0x28,0x12,0x82,0xe0,0xef,0xff,0xff] vmovsldup -4128(%rdx), %ymm24 +// CHECK: vmovddup %xmm23, %xmm17 +// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x12,0xcf] + vmovddup %xmm23, %xmm17 + +// CHECK: vmovddup %xmm23, %xmm17 {%k6} +// CHECK: encoding: [0x62,0xa1,0xff,0x0e,0x12,0xcf] + vmovddup %xmm23, %xmm17 {%k6} + +// CHECK: vmovddup %xmm23, %xmm17 {%k6} {z} +// CHECK: encoding: [0x62,0xa1,0xff,0x8e,0x12,0xcf] + vmovddup %xmm23, %xmm17 {%k6} {z} + +// CHECK: 
vmovddup (%rcx), %xmm17 +// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x12,0x09] + vmovddup (%rcx), %xmm17 + +// CHECK: vmovddup 291(%rax,%r14,8), %xmm17 +// CHECK: encoding: [0x62,0xa1,0xff,0x08,0x12,0x8c,0xf0,0x23,0x01,0x00,0x00] + vmovddup 291(%rax,%r14,8), %xmm17 + +// CHECK: vmovddup 1016(%rdx), %xmm17 +// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x12,0x4a,0x7f] + vmovddup 1016(%rdx), %xmm17 + +// CHECK: vmovddup 1024(%rdx), %xmm17 +// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x12,0x8a,0x00,0x04,0x00,0x00] + vmovddup 1024(%rdx), %xmm17 + +// CHECK: vmovddup -1024(%rdx), %xmm17 +// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x12,0x4a,0x80] + vmovddup -1024(%rdx), %xmm17 + +// CHECK: vmovddup -1032(%rdx), %xmm17 +// CHECK: encoding: [0x62,0xe1,0xff,0x08,0x12,0x8a,0xf8,0xfb,0xff,0xff] + vmovddup -1032(%rdx), %xmm17 + +// CHECK: vmovddup %ymm25, %ymm18 +// CHECK: encoding: [0x62,0x81,0xff,0x28,0x12,0xd1] + vmovddup %ymm25, %ymm18 + +// CHECK: vmovddup %ymm25, %ymm18 {%k4} +// CHECK: encoding: [0x62,0x81,0xff,0x2c,0x12,0xd1] + vmovddup %ymm25, %ymm18 {%k4} + +// CHECK: vmovddup %ymm25, %ymm18 {%k4} {z} +// CHECK: encoding: [0x62,0x81,0xff,0xac,0x12,0xd1] + vmovddup %ymm25, %ymm18 {%k4} {z} + +// CHECK: vmovddup (%rcx), %ymm18 +// CHECK: encoding: [0x62,0xe1,0xff,0x28,0x12,0x11] + vmovddup (%rcx), %ymm18 + +// CHECK: vmovddup 291(%rax,%r14,8), %ymm18 +// CHECK: encoding: [0x62,0xa1,0xff,0x28,0x12,0x94,0xf0,0x23,0x01,0x00,0x00] + vmovddup 291(%rax,%r14,8), %ymm18 + +// CHECK: vmovddup 4064(%rdx), %ymm18 +// CHECK: encoding: [0x62,0xe1,0xff,0x28,0x12,0x52,0x7f] + vmovddup 4064(%rdx), %ymm18 + +// CHECK: vmovddup 4096(%rdx), %ymm18 +// CHECK: encoding: [0x62,0xe1,0xff,0x28,0x12,0x92,0x00,0x10,0x00,0x00] + vmovddup 4096(%rdx), %ymm18 + +// CHECK: vmovddup -4096(%rdx), %ymm18 +// CHECK: encoding: [0x62,0xe1,0xff,0x28,0x12,0x52,0x80] + vmovddup -4096(%rdx), %ymm18 + +// CHECK: vmovddup -4128(%rdx), %ymm18 +// CHECK: encoding: [0x62,0xe1,0xff,0x28,0x12,0x92,0xe0,0xef,0xff,0xff] + vmovddup -4128(%rdx), %ymm18 +