Index: include/llvm/IR/IntrinsicsX86.td =================================================================== --- include/llvm/IR/IntrinsicsX86.td +++ include/llvm/IR/IntrinsicsX86.td @@ -1829,30 +1829,6 @@ def int_x86_avx2_vbroadcast_ss_ps_256 : GCCBuiltin<"__builtin_ia32_vbroadcastss_ps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastb_128 : - GCCBuiltin<"__builtin_ia32_pbroadcastb128">, - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastb_256 : - GCCBuiltin<"__builtin_ia32_pbroadcastb256">, - Intrinsic<[llvm_v32i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastw_128 : - GCCBuiltin<"__builtin_ia32_pbroadcastw128">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastw_256 : - GCCBuiltin<"__builtin_ia32_pbroadcastw256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v8i16_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastd_128 : - GCCBuiltin<"__builtin_ia32_pbroadcastd128">, - Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastd_256 : - GCCBuiltin<"__builtin_ia32_pbroadcastd256">, - Intrinsic<[llvm_v8i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastq_128 : - GCCBuiltin<"__builtin_ia32_pbroadcastq128">, - Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; - def int_x86_avx2_pbroadcastq_256 : - GCCBuiltin<"__builtin_ia32_pbroadcastq256">, - Intrinsic<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_mask_pbroadcast_d_gpr_512 : GCCBuiltin<"__builtin_ia32_pbroadcastd512_gpr_mask">, Intrinsic<[llvm_v16i32_ty], [llvm_i32_ty, llvm_v16i32_ty, Index: lib/IR/AutoUpgrade.cpp =================================================================== --- lib/IR/AutoUpgrade.cpp +++ lib/IR/AutoUpgrade.cpp @@ -129,6 +129,7 @@ Name.startswith("x86.sse2.pcmpgt.") || Name.startswith("x86.avx2.pcmpeq.") || Name.startswith("x86.avx2.pcmpgt.") || + Name.startswith("x86.avx2.pbroadcast") || Name.startswith("x86.avx.vpermil.") || Name == "x86.avx.vinsertf128.pd.256" || Name == "x86.avx.vinsertf128.ps.256" || @@ -446,6 +447,13 @@ const int Idxs[4] = { 0, 1, 0, 1 }; Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()), Idxs); + } else if (Name.startswith("llvm.x86.avx2.pbroadcast")) { + // Replace vpbroadcasts with a vector shuffle. + Value *Op = CI->getArgOperand(0); + unsigned NumElts = CI->getType()->getVectorNumElements(); + Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts); + Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()), + Constant::getNullValue(MaskTy)); } else if (Name == "llvm.x86.sse2.psll.dq") { // 128-bit shift left specified in bits. unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -8303,74 +8303,31 @@ // multiclass avx2_broadcast opc, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, - Intrinsic Int128, Intrinsic Int256> { + ValueType OpVT128, ValueType OpVT256> { def rr : AVX28I, + [(set VR128:$dst, (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle]>, VEX; def rm : AVX28I, + [(set VR128:$dst, (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX; def Yrr : AVX28I, + [(set VR256:$dst, (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle256]>, VEX, VEX_L; def Yrm : AVX28I, + [(set VR256:$dst, (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX, VEX_L; } -defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, - int_x86_avx2_pbroadcastb_128, - int_x86_avx2_pbroadcastb_256>; -defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, - int_x86_avx2_pbroadcastw_128, - int_x86_avx2_pbroadcastw_256>; -defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, - int_x86_avx2_pbroadcastd_128, - int_x86_avx2_pbroadcastd_256>; -defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, - int_x86_avx2_pbroadcastq_128, - int_x86_avx2_pbroadcastq_256>; +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, v16i8, v32i8>; +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, v8i16, v16i16>; +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, v4i32, v8i32>; +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, v2i64, v4i64>; let Predicates = [HasAVX2] in { - def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBrm addr:$src)>; - def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBYrm addr:$src)>; - def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWrm addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWYrm addr:$src)>; - def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDrm addr:$src)>; - def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDYrm addr:$src)>; - def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQYrm addr:$src)>; - - def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBrr VR128:$src)>; - def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBYrr VR128:$src)>; - def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWrr VR128:$src)>; - def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWYrr VR128:$src)>; - def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDrr VR128:$src)>; - def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDYrr VR128:$src)>; - def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), - (VPBROADCASTQYrr VR128:$src)>; def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), (VBROADCASTSSrr VR128:$src)>; def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))), Index: test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll +++ test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -111,3 +111,90 @@ ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64>, <2 x i64>, i8) nounwind readnone + +define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastb_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0) + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly + + +define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastb_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 +; CHECK-NEXT: retl + %res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0) + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly + + +define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastw_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0) + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly + + +define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastw_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 +; CHECK-NEXT: retl + %res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0) + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly + + +define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastd_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly + + +define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastd_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly + + +define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastq_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 +; CHECK-NEXT: retl + %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0) + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly + + +define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) { +; CHECK-LABEL: test_x86_avx2_pbroadcastq_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: retl + %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly Index: test/CodeGen/X86/avx2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-x86.ll +++ test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -681,70 +681,6 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone -define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) { - ; CHECK: vpbroadcastb - %res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly - - -define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) { - ; CHECK: vpbroadcastb - %res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly - - -define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) { - ; CHECK: vpbroadcastw - %res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly - - -define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) { - ; CHECK: vpbroadcastw - %res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly - - -define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) { - ; CHECK: vbroadcastss - %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly - - -define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) { - ; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}} - %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) ; <<8 x i32>> [#uses=1] - ret <8 x i32> %res -} -declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly - - -define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) { - ; CHECK: vpbroadcastq - %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly - - -define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) { - ; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}} - %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly - - define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) { ; Check that the arguments are swapped between the intrinsic definition ; and its lowering. Indeed, the offsets are the first source in Index: test/CodeGen/X86/stack-folding-int-avx2.ll =================================================================== --- test/CodeGen/X86/stack-folding-int-avx2.ll +++ test/CodeGen/X86/stack-folding-int-avx2.ll @@ -286,81 +286,73 @@ ;CHECK-LABEL: stack_fold_pbroadcastb ;CHECK: vpbroadcastb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0) + %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer ret <16 x i8> %2 } -declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) { ;CHECK-LABEL: stack_fold_pbroadcastb_ymm ;CHECK: vpbroadcastb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0) + %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> zeroinitializer ret <32 x i8> %2 } -declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) { ;CHECK-LABEL: stack_fold_pbroadcastd ;CHECK: vpbroadcastd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) + %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> zeroinitializer ; add forces execution domain %3 = add <4 x i32> %2, ret <4 x i32> %3 } -declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) { ;CHECK-LABEL: stack_fold_pbroadcastd_ymm ;CHECK: vpbroadcastd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) + %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> zeroinitializer ; add forces execution domain %3 = add <8 x i32> %2, ret <8 x i32> %3 } -declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) { ;CHECK-LABEL: stack_fold_pbroadcastq ;CHECK: vpbroadcastq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0) + %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer ; add forces execution domain %3 = add <2 x i64> %2, ret <2 x i64> %3 } -declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) { ;CHECK-LABEL: stack_fold_pbroadcastq_ymm ;CHECK: vpbroadcastq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) + %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer ; add forces execution domain %3 = add <4 x i64> %2, ret <4 x i64> %3 } -declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) { ;CHECK-LABEL: stack_fold_pbroadcastw ;CHECK: vpbroadcastw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0) + %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %2 } -declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) { ;CHECK-LABEL: stack_fold_pbroadcastw_ymm ;CHECK: vpbroadcastw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0) + %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> zeroinitializer ret <16 x i16> %2 } -declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly define <32 x i8> @stack_fold_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1) { ;CHECK-LABEL: stack_fold_pcmpeqb