diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18960,6 +18960,20 @@
     }
   }
 
+  // If we are not inserting into the low 128-bit subvector,
+  // then prefer the broadcast+blend sequence.
+  // TODO: Is it worthwhile to cast integer to floating point and back
+  // and incur a domain crossing penalty?
+  if (IdxVal * EltSizeInBits >= 128 &&
+      ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+       Subtarget.hasAVX2())) {
+    SDValue N1SplatVec = DAG.getNode(ISD::SPLAT_VECTOR, dl, VT, N1);
+    SmallVector<int, 8> BlendMask;
+    for (unsigned i = 0; i != NumElts; ++i)
+      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+    return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
+  }
+
   // Get the desired 128-bit vector chunk.
   SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
--- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll
+++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
@@ -503,18 +503,15 @@
 ; NOGATHER-NEXT:    je .LBB7_10
 ; NOGATHER-NEXT:  # %bb.9: # %cond.load10
 ; NOGATHER-NEXT:    vmovq %xmm0, %rcx
-; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; NOGATHER-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT:    vbroadcastss (%rcx), %ymm2
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
 ; NOGATHER-NEXT:  .LBB7_10: # %else11
 ; NOGATHER-NEXT:    testb $32, %al
 ; NOGATHER-NEXT:    je .LBB7_12
 ; NOGATHER-NEXT:  # %bb.11: # %cond.load13
 ; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rcx
-; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT:    vbroadcastss (%rcx), %ymm2
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
 ; NOGATHER-NEXT:  .LBB7_12: # %else14
 ; NOGATHER-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; NOGATHER-NEXT:    testb $64, %al
@@ -527,16 +524,14 @@
 ; NOGATHER-NEXT:    retq
 ; NOGATHER-NEXT:  .LBB7_13: # %cond.load16
 ; NOGATHER-NEXT:    vmovq %xmm0, %rcx
-; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT:    vbroadcastss (%rcx), %ymm2
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; NOGATHER-NEXT:    testb $-128, %al
 ; NOGATHER-NEXT:    je .LBB7_16
 ; NOGATHER-NEXT:  .LBB7_15: # %cond.load19
 ; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; NOGATHER-NEXT:    vbroadcastss (%rax), %ymm0
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
 ; NOGATHER-NEXT:    vmovaps %ymm1, %ymm0
 ; NOGATHER-NEXT:    retq
 entry:
@@ -667,16 +662,14 @@
 ; NOGATHER-NEXT:    retq
 ; NOGATHER-NEXT:  .LBB9_5: # %cond.load4
 ; NOGATHER-NEXT:    vmovq %xmm0, %rcx
-; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; NOGATHER-NEXT:    vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NOGATHER-NEXT:    vbroadcastsd (%rcx), %ymm2
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
 ; NOGATHER-NEXT:    testb $8, %al
 ; NOGATHER-NEXT:    je .LBB9_8
 ; NOGATHER-NEXT:  .LBB9_7: # %cond.load7
 ; NOGATHER-NEXT:    vpextrq $1, %xmm0, %rax
-; NOGATHER-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; NOGATHER-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; NOGATHER-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; NOGATHER-NEXT:    vbroadcastsd (%rax), %ymm0
+; NOGATHER-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
 ; NOGATHER-NEXT:    vmovaps %ymm1, %ymm0
 ; NOGATHER-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -8,9 +8,9 @@
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
 ; CHECK-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; CHECK-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
+; CHECK-NEXT:    vbroadcastss %xmm1, %zmm1
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,15]
+; CHECK-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %rrr = load float, float* %br
   %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
@@ -19,14 +19,23 @@
 }
 
 define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
-; CHECK-LABEL: test2:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1]
-; CHECK-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: test2:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
+; KNL-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; KNL-NEXT:    movb $64, %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test2:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
+; SKX-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; SKX-NEXT:    movb $64, %al
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
   %rrr = load double, double* %br
   %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
   %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -535,14 +544,23 @@
 }
 
 define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
-; CHECK-LABEL: insert_v8i64:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: insert_v8i64:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT:    movb $8, %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_v8i64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    movb $8, %al
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
+; SKX-NEXT:    retq
   %val = load i64, i64* %ptr
   %r1 = insertelement <8 x i64> %x, i64 %val, i32 1
   %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3
@@ -550,13 +568,22 @@
 }
 
 define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
-; CHECK-LABEL: insert_v4i64:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: insert_v4i64:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT:    vmovq %rdi, %xmm1
+; KNL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_v4i64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT:    vpbroadcastq %rdi, %ymm1
+; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; SKX-NEXT:    retq
   %val = load i64, i64* %ptr
   %r1 = insertelement <4 x i64> %x, i64 %val, i32 1
   %r2 = insertelement <4 x i64> %r1, i64 %y, i32 3
@@ -576,14 +603,23 @@
 }
 
 define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
-; CHECK-LABEL: insert_v16i32:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: insert_v16i32:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT:    movw $32, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_v16i32:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    movw $32, %ax
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
+; SKX-NEXT:    retq
   %val = load i32, i32* %ptr
   %r1 = insertelement <16 x i32> %x, i32 %val, i32 1
   %r2 = insertelement <16 x i32> %r1, i32 %y, i32 5
@@ -591,13 +627,22 @@
 }
 
 define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
-; CHECK-LABEL: insert_v8i32:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: insert_v8i32:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT:    vmovd %edi, %xmm1
+; KNL-NEXT:    vpbroadcastd %xmm1, %ymm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_v8i32:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT:    vpbroadcastd %edi, %ymm1
+; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; SKX-NEXT:    retq
   %val = load i32, i32* %ptr
   %r1 = insertelement <8 x i32> %x, i32 %val, i32 1
   %r2 = insertelement <8 x i32> %r1, i32 %y, i32 5
@@ -617,14 +662,24 @@
 }
 
 define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
-; CHECK-LABEL: insert_v32i16:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: insert_v32i16:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; KNL-NEXT:    vmovd %edi, %xmm0
+; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_v32i16:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    movl $512, %eax ## imm = 0x200
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vpbroadcastw %edi, %zmm0 {%k1}
+; SKX-NEXT:    retq
   %val = load i16, i16* %ptr
   %r1 = insertelement <32 x i16> %x, i16 %val, i32 1
   %r2 = insertelement <32 x i16> %r1, i16 %y, i32 9
@@ -632,13 +687,24 @@
 }
 
 define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
-; CHECK-LABEL: insert_v16i16:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: insert_v16i16:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT:    vmovd %edi, %xmm1
+; KNL-NEXT:    vpbroadcastw %xmm1, %ymm1
+; KNL-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_v16i16:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT:    vpbroadcastw %edi, %ymm2
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,25,10,11,12,13,14,15]
+; SKX-NEXT:    vpermi2w %ymm2, %ymm1, %ymm0
+; SKX-NEXT:    retq
   %val = load i16, i16* %ptr
   %r1 = insertelement <16 x i16> %x, i16 %val, i32 1
   %r2 = insertelement <16 x i16> %r1, i16 %y, i32 9
@@ -658,14 +724,15 @@
 }
 
 define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
-; CHECK-LABEL: insert_v64i8:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; CHECK-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT:    vpinsrb $2, %edi, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti32x4 $3, %xmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: insert_v64i8:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; KNL-NEXT:    vmovd %edi, %xmm0
+; KNL-NEXT:    vpbroadcastb %xmm0, %ymm0
+; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
+; KNL-NEXT:    retq
   %val = load i8, i8* %ptr
   %r1 = insertelement <64 x i8> %x, i8 %val, i32 1
   %r2 = insertelement <64 x i8> %r1, i8 %y, i32 50
@@ -673,13 +740,24 @@
 }
 
 define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
-; CHECK-LABEL: insert_v32i8:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: insert_v32i8:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT:    vmovd %edi, %xmm1
+; KNL-NEXT:    vpbroadcastb %xmm1, %ymm1
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; KNL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_v32i8:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
+; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT:    movl $131072, %eax ## imm = 0x20000
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vpbroadcastb %edi, %ymm0 {%k1}
+; SKX-NEXT:    retq
   %val = load i8, i8* %ptr
   %r1 = insertelement <32 x i8> %x, i8 %val, i32 1
   %r2 = insertelement <32 x i8> %r1, i8 %y, i32 17
@@ -739,23 +817,39 @@
 }
 
 define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
-; CHECK-LABEL: test_insert_128_v16i16:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpinsrw $2, %edi, %xmm1, %xmm1
-; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: test_insert_128_v16i16:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vmovd %edi, %xmm1
+; KNL-NEXT:    vpbroadcastw %xmm1, %ymm1
+; KNL-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_insert_128_v16i16:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpbroadcastw %edi, %ymm1
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,26,11,12,13,14,15]
+; SKX-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; SKX-NEXT:    retq
   %r = insertelement <16 x i16> %x, i16 %y, i32 10
   ret <16 x i16> %r
 }
 
 define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
-; CHECK-LABEL: test_insert_128_v32i8:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpinsrb $4, %edi, %xmm1, %xmm1
-; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
+; KNL-LABEL: test_insert_128_v32i8:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vmovd %edi, %xmm1
+; KNL-NEXT:    vpbroadcastb %xmm1, %ymm1
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; KNL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_insert_128_v32i8:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    movl $1048576, %eax ## imm = 0x100000
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vpbroadcastb %edi, %ymm0 {%k1}
+; SKX-NEXT:    retq
   %r = insertelement <32 x i8> %x, i8 %y, i32 20
   ret <32 x i8> %r
 }
diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
--- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -72,12 +72,19 @@
 
 define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
 ;
-; AVX512-LABEL: load_one_mask_bit_set5:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX512-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: load_one_mask_bit_set5:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    movb $-128, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vbroadcastsd 56(%rdi), %zmm0 {%k1}
+; AVX512F-NEXT:    retq
+;
+; SKX-LABEL: load_one_mask_bit_set5:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    movb $-128, %al
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vbroadcastsd 56(%rdi), %zmm0 {%k1}
+; SKX-NEXT:    retq
   %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
   ret <8 x double> %res
 }
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -459,22 +459,41 @@
 ; AVX2-NEXT:    movl $255, %eax
 ; AVX2-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movl $255, %eax
-; AVX512-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
-; AVX512-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movl $255, %eax
+; AVX512F-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
+; AVX512F-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movl $255, %eax
+; AVX512VL-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    movl $1073741824, %eax # imm = 0x40000000
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
   %1 = insertelement <32 x i8> %a, i8 -1, i32 0
   %2 = insertelement <32 x i8> %1, i8 -1, i32 15
   %3 = insertelement <32 x i8> %2, i8 -1, i32 30
diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll
--- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll
+++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll
@@ -30,19 +30,20 @@
 
 define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind {
 ; X86_AVX256-LABEL: insert_subvector_512:
 ; X86_AVX256:       # %bb.0:
-; X86_AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; X86_AVX256-NEXT:    vpinsrd $0, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; X86_AVX256-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; X86_AVX256-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X86_AVX256-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %ymm2
+; X86_AVX256-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7]
+; X86_AVX256-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %ymm2
+; X86_AVX256-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
 ; X86_AVX256-NEXT:    retl
 ;
 ; X64_AVX256-LABEL: insert_subvector_512:
 ; X64_AVX256:       # %bb.0:
 ; X64_AVX256-NEXT:    vmovd %edi, %xmm2
 ; X64_AVX256-NEXT:    vpinsrd $1, %esi, %xmm2, %xmm2
-; X64_AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; X64_AVX256-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
-; X64_AVX256-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X64_AVX256-NEXT:    vmovq %xmm2, %rax
+; X64_AVX256-NEXT:    vmovq %rax, %xmm2
+; X64_AVX256-NEXT:    vpbroadcastq %xmm2, %ymm2
+; X64_AVX256-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; X64_AVX256-NEXT:    retq
 ;
 ; X86_AVX512-LABEL: insert_subvector_512:
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -216,16 +216,14 @@
 ; AVX1-NEXT:    testb $4, %al
 ; AVX1-NEXT:    je LBB1_6
 ; AVX1-NEXT:  LBB1_5: ## %cond.load5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0],xmm1[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testb $8, %al
 ; AVX1-NEXT:    je LBB1_8
 ; AVX1-NEXT:  LBB1_7: ## %cond.load9
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovhps (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: expandload_v4f64_v4i64:
@@ -259,16 +257,14 @@
 ; AVX2-NEXT:    testb $4, %al
 ; AVX2-NEXT:    je LBB1_6
 ; AVX2-NEXT:  LBB1_5: ## %cond.load5
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0],xmm1[1]
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $8, %al
 ; AVX2-NEXT:    je LBB1_8
 ; AVX2-NEXT:  LBB1_7: ## %cond.load9
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovhpd (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: expandload_v4f64_v4i64:
@@ -405,16 +401,14 @@
 ; AVX1-NEXT:    testb $4, %al
 ; AVX1-NEXT:    je LBB2_6
 ; AVX1-NEXT:  LBB2_5: ## %cond.load5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testb $8, %al
 ; AVX1-NEXT:    je LBB2_8
 ; AVX1-NEXT:  LBB2_7: ## %cond.load9
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testb $16, %al
 ; AVX1-NEXT:    je LBB2_10
@@ -431,16 +425,14 @@
 ; AVX1-NEXT:    testb $64, %al
 ; AVX1-NEXT:    je LBB2_14
 ; AVX1-NEXT:  LBB2_13: ## %cond.load21
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testb $-128, %al
 ; AVX1-NEXT:    je LBB2_16
 ; AVX1-NEXT:  LBB2_15: ## %cond.load25
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: expandload_v8f64_v8i1:
@@ -486,16 +478,14 @@
 ; AVX2-NEXT:    testb $4, %al
 ; AVX2-NEXT:    je LBB2_6
 ; AVX2-NEXT:  LBB2_5: ## %cond.load5
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $8, %al
 ; AVX2-NEXT:    je LBB2_8
 ; AVX2-NEXT:  LBB2_7: ## %cond.load9
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $16, %al
 ; AVX2-NEXT:    je LBB2_10
@@ -512,16 +502,14 @@
 ; AVX2-NEXT:    testb $64, %al
 ; AVX2-NEXT:    je LBB2_14
 ; AVX2-NEXT:  LBB2_13: ## %cond.load21
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $-128, %al
 ; AVX2-NEXT:    je LBB2_16
 ; AVX2-NEXT:  LBB2_15: ## %cond.load25
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: expandload_v8f64_v8i1:
@@ -777,16 +765,14 @@
 ; AVX1-NEXT:    testb $4, %al
 ; AVX1-NEXT:    je LBB3_6
 ; AVX1-NEXT:  LBB3_5: ## %cond.load5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testb $8, %al
 ; AVX1-NEXT:    je LBB3_8
 ; AVX1-NEXT:  LBB3_7: ## %cond.load9
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testb $16, %al
 ; AVX1-NEXT:    je LBB3_10
@@ -803,16 +789,14 @@
 ; AVX1-NEXT:    testb $64, %al
 ; AVX1-NEXT:    je LBB3_14
 ; AVX1-NEXT:  LBB3_13: ## %cond.load21
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testb $-128, %al
 ; AVX1-NEXT:    je LBB3_16
 ; AVX1-NEXT:  LBB3_15: ## %cond.load25
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
 ; AVX1-NEXT:    je LBB3_18
@@ -829,16 +813,14 @@
 ; AVX1-NEXT:    testl $1024, %eax ## imm = 0x400
 ; AVX1-NEXT:    je LBB3_22
 ; AVX1-NEXT:  LBB3_21: ## %cond.load37
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testl $2048, %eax ## imm = 0x800
 ; AVX1-NEXT:    je LBB3_24
 ; AVX1-NEXT:  LBB3_23: ## %cond.load41
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
 ; AVX1-NEXT:    je LBB3_26
@@ -855,16 +837,14 @@
 ; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
 ; AVX1-NEXT:    je LBB3_30
 ; AVX1-NEXT:  LBB3_29: ## %cond.load53
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
 ; AVX1-NEXT:    addq $8, %rdi
 ; AVX1-NEXT:    testl $32768, %eax ## imm = 0x8000
 ; AVX1-NEXT:    je LBB3_32
 ; AVX1-NEXT:  LBB3_31: ## %cond.load57
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: expandload_v16f64_v16i32:
@@ -939,16 +919,14 @@
 ; AVX2-NEXT:    testb $4, %al
 ; AVX2-NEXT:    je LBB3_6
 ; AVX2-NEXT:  LBB3_5: ## %cond.load5
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $8, %al
 ; AVX2-NEXT:    je LBB3_8
 ; AVX2-NEXT:  LBB3_7: ## %cond.load9
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $16, %al
 ; AVX2-NEXT:    je LBB3_10
@@ -965,16 +943,14 @@
 ; AVX2-NEXT:    testb $64, %al
 ; AVX2-NEXT:    je LBB3_14
 ; AVX2-NEXT:  LBB3_13: ## %cond.load21
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT:    vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testb $-128, %al
 ; AVX2-NEXT:    je LBB3_16
 ; AVX2-NEXT:  LBB3_15: ## %cond.load25
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT:    vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
 ; AVX2-NEXT:    je LBB3_18
@@ -991,16 +967,14 @@
 ; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
 ; AVX2-NEXT:    je LBB3_22
 ; AVX2-NEXT:  LBB3_21: ## %cond.load37
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
 ; AVX2-NEXT:    je LBB3_24
 ; AVX2-NEXT:  LBB3_23: ## %cond.load41
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
 ; AVX2-NEXT:    je LBB3_26
@@ -1017,16 +991,14 @@
 ; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
 ; AVX2-NEXT:    je LBB3_30
 ; AVX2-NEXT:  LBB3_29: ## %cond.load53
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
 ; AVX2-NEXT:    addq $8, %rdi
 ; AVX2-NEXT:    testl $32768, %eax ## imm = 0x8000
 ; AVX2-NEXT:    je LBB3_32
 ; AVX2-NEXT:  LBB3_31: ## %cond.load57
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpbroadcastq (%rdi), %ymm4
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: expandload_v16f64_v16i32:
@@ -2193,31 +2165,26 @@
 ; AVX1-NEXT:    testb $16, %al
 ; AVX1-NEXT:    je LBB8_10
 ; AVX1-NEXT:  LBB8_9: ## %cond.load13
-; AVX1-NEXT:    vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testb $32, %al
 ; AVX1-NEXT:    je LBB8_12
 ; AVX1-NEXT:  LBB8_11: ## %cond.load17
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testb $64, %al
 ; AVX1-NEXT:    je LBB8_14
 ; AVX1-NEXT:  LBB8_13: ## %cond.load21
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6],ymm0[7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testb $-128, %al
 ; AVX1-NEXT:    je LBB8_16
 ; AVX1-NEXT:  LBB8_15: ## %cond.load25
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $256, %eax ## imm = 0x100
 ; AVX1-NEXT:    je LBB8_18
@@ -2246,31 +2213,26 @@
 ; AVX1-NEXT:    testl $4096, %eax ## imm = 0x1000
 ; AVX1-NEXT:    je LBB8_26
 ; AVX1-NEXT:  LBB8_25: ## %cond.load45
-; AVX1-NEXT:    vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $8192, %eax ## imm = 0x2000
 ; AVX1-NEXT:    je LBB8_28
 ; AVX1-NEXT:  LBB8_27: ## %cond.load49
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $16384, %eax ## imm = 0x4000
 ; AVX1-NEXT:    je LBB8_30
 ; AVX1-NEXT:  LBB8_29: ## %cond.load53
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $32768, %eax ## imm = 0x8000
 ; AVX1-NEXT:    je LBB8_32
 ; AVX1-NEXT:  LBB8_31: ## %cond.load57
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $65536, %eax ## imm = 0x10000
 ; AVX1-NEXT:    je LBB8_34
@@ -2299,31 +2261,26 @@
 ; AVX1-NEXT:    testl $1048576, %eax ## imm = 0x100000
 ; AVX1-NEXT:    je LBB8_42
 ; AVX1-NEXT:  LBB8_41: ## %cond.load77
-; AVX1-NEXT:    vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $2097152, %eax ## imm = 0x200000
 ; AVX1-NEXT:    je LBB8_44
 ; AVX1-NEXT:  LBB8_43: ## %cond.load81
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $4194304, %eax ## imm = 0x400000
 ; AVX1-NEXT:    je LBB8_46
 ; AVX1-NEXT:  LBB8_45: ## %cond.load85
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $8388608, %eax ## imm = 0x800000
 ; AVX1-NEXT:    je LBB8_48
 ; AVX1-NEXT:  LBB8_47: ## %cond.load89
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $16777216, %eax ## imm = 0x1000000
 ; AVX1-NEXT:    je LBB8_50
@@ -2352,31 +2309,26 @@
 ; AVX1-NEXT:    testl $268435456, %eax ## imm = 0x10000000
 ; AVX1-NEXT:    je LBB8_58
 ; AVX1-NEXT:  LBB8_57: ## %cond.load109
-; AVX1-NEXT:    vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $536870912, %eax ## imm = 0x20000000
 ; AVX1-NEXT:    je LBB8_60
 ; AVX1-NEXT:  LBB8_59: ## %cond.load113
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
 ; AVX1-NEXT:    je LBB8_62
 ; AVX1-NEXT:  LBB8_61: ## %cond.load117
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
 ; AVX1-NEXT:    addq $4, %rdi
 ; AVX1-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
 ; AVX1-NEXT:    je LBB8_64
 ; AVX1-NEXT:  LBB8_63: ## %cond.load121
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: expandload_v32f32_v32i32:
@@ -2515,31 +2467,26 @@
 ; AVX2-NEXT:    testb $16, %al
 ; AVX2-NEXT:    je LBB8_10
 ; AVX2-NEXT:  LBB8_9: ## %cond.load13
-; AVX2-NEXT:    vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testb $32, %al
 ; AVX2-NEXT:    je LBB8_12
 ; AVX2-NEXT:  LBB8_11: ## %cond.load17
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testb $64, %al
 ; AVX2-NEXT:    je LBB8_14
 ; AVX2-NEXT:  LBB8_13: ## %cond.load21
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6],ymm0[7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testb $-128, %al
 ; AVX2-NEXT:    je LBB8_16
 ; AVX2-NEXT:  LBB8_15: ## %cond.load25
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
 ; AVX2-NEXT:    je LBB8_18
@@ -2568,31 +2515,26 @@
 ; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
 ; AVX2-NEXT:    je LBB8_26
 ; AVX2-NEXT:  LBB8_25: ## %cond.load45
-; AVX2-NEXT:    vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
 ; AVX2-NEXT:    je LBB8_28
 ; AVX2-NEXT:  LBB8_27: ## %cond.load49
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT:    vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
 ; AVX2-NEXT:    je LBB8_30
 ; AVX2-NEXT:  LBB8_29: ## %cond.load53
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT:    vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $32768, %eax ## imm = 0x8000
 ; AVX2-NEXT:    je LBB8_32
 ; AVX2-NEXT:  LBB8_31: ## %cond.load57
-; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT:    vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $65536, %eax ## imm = 0x10000
 ; AVX2-NEXT:    je LBB8_34
@@ -2621,31 +2563,26 @@
 ; AVX2-NEXT:    testl $1048576, %eax ## imm = 0x100000
 ; AVX2-NEXT:    je LBB8_42
 ; AVX2-NEXT:  LBB8_41: ## %cond.load77
-; AVX2-NEXT:    vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $2097152, %eax ## imm = 0x200000
 ; AVX2-NEXT:    je LBB8_44
 ; AVX2-NEXT:  LBB8_43: ## %cond.load81
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $4194304, %eax ## imm = 0x400000
 ; AVX2-NEXT:    je LBB8_46
 ; AVX2-NEXT:  LBB8_45: ## %cond.load85
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $8388608, %eax ## imm = 0x800000
 ; AVX2-NEXT:    je LBB8_48
 ; AVX2-NEXT:  LBB8_47: ## %cond.load89
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $16777216, %eax ## imm = 0x1000000
 ; AVX2-NEXT:    je LBB8_50
@@ -2674,31 +2611,26 @@
 ; AVX2-NEXT:    testl $268435456, %eax ## imm = 0x10000000
 ; AVX2-NEXT:    je LBB8_58
 ; AVX2-NEXT:  LBB8_57: ## %cond.load109
-; AVX2-NEXT:    vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX2-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $536870912, %eax ## imm = 0x20000000
 ; AVX2-NEXT:    je LBB8_60
 ; AVX2-NEXT:  LBB8_59: ## %cond.load113
-; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $1073741824, %eax ## imm = 0x40000000
 ; AVX2-NEXT:    je LBB8_62
 ; AVX2-NEXT:  LBB8_61: ## %cond.load117
-; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
 ; AVX2-NEXT:    addq $4, %rdi
 ; AVX2-NEXT:    testl $-2147483648, %eax ## imm = 0x80000000
 ; AVX2-NEXT:    je LBB8_64
 ; AVX2-NEXT:  LBB8_63: ## %cond.load121
-; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm4
+; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: expandload_v32f32_v32i32:
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -1467,9 +1467,8 @@
 ; AVX2-NEXT:    testb $-128, %al
 ; AVX2-NEXT:    je .LBB4_16
 ; AVX2-NEXT:  .LBB4_15: # %cond.load19
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpinsrd $3, c+12(%rip), %xmm2, %xmm2
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastd c+12(%rip), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:  .LBB4_16: # %else20
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
@@ -1499,9 +1498,8 @@
 ; AVX2-NEXT:    testb $-128, %al
 ; AVX2-NEXT:    je .LBB4_32
 ; AVX2-NEXT:  .LBB4_31: # %cond.load58
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpinsrd $3, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
 ; AVX2-NEXT:  .LBB4_32: # %else61
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm0, %ymm0
@@ -1528,17 +1526,15 @@
 ; AVX2-NEXT:    testb $64, %al
 ; AVX2-NEXT:    je .LBB4_46
 ; AVX2-NEXT:  .LBB4_45: # %cond.load94
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpinsrd $2, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7]
 ; AVX2-NEXT:  .LBB4_46: # %else97
 ; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    testb $-128, %al
 ; AVX2-NEXT:    je .LBB4_48
 ; AVX2-NEXT:  # %bb.47: # %cond.load99
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpinsrd $3, c+28(%rip), %xmm2, %xmm2
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd c+28(%rip), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
 ; AVX2-NEXT:  .LBB4_48: # %else102
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
@@ -1562,21 +1558,18 @@
 ; AVX2-NEXT:    testb $16, %al
 ; AVX2-NEXT:    je .LBB4_10
 ; AVX2-NEXT:  .LBB4_9: # %cond.load10
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpinsrd $0, c+12(%rip), %xmm2, %xmm2
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastd c+12(%rip), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
 ; AVX2-NEXT:    testb $32, %al
 ; AVX2-NEXT:    je .LBB4_12
 ; AVX2-NEXT:  .LBB4_11: # %cond.load13
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpinsrd $1, c+12(%rip), %xmm2, %xmm2
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastd c+12(%rip), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
 ; AVX2-NEXT:    testb $64, %al
 ; AVX2-NEXT:    je .LBB4_14
 ; AVX2-NEXT:  .LBB4_13: # %cond.load16
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpinsrd $2, c+12(%rip), %xmm2, %xmm2
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastd c+12(%rip), %ymm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
 ; AVX2-NEXT:    testb $-128, %al
 ; AVX2-NEXT:    jne .LBB4_15
 ; AVX2-NEXT:    jmp .LBB4_16
@@ -1600,21 +1593,18 @@
 ; AVX2-NEXT:    testb $16, %al
 ; AVX2-NEXT:    je .LBB4_26
 ; AVX2-NEXT:  .LBB4_25: # %cond.load43
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpinsrd $0, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7]
 ; AVX2-NEXT:    testb $32, %al
 ; AVX2-NEXT:    je .LBB4_28
 ; AVX2-NEXT:  .LBB4_27: # %cond.load48
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpinsrd $1, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-NEXT:    testb $64, %al
 ; AVX2-NEXT:    je .LBB4_30
 ; AVX2-NEXT:  .LBB4_29: # %cond.load53
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpinsrd $2, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
 ; AVX2-NEXT:    testb $-128, %al
 ; AVX2-NEXT:    jne .LBB4_31
 ; AVX2-NEXT:    jmp .LBB4_32
@@ -1638,15 +1628,13 @@
 ; AVX2-NEXT:    testb $16, %al
 ; AVX2-NEXT:    je .LBB4_42
 ; AVX2-NEXT:  .LBB4_41: # %cond.load84
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpinsrd $0, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7]
 ; AVX2-NEXT:    testb $32, %al
 ; AVX2-NEXT:    je .LBB4_44
 ; AVX2-NEXT:  .LBB4_43: # %cond.load89
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpinsrd $1, c+28(%rip), %xmm3, %xmm3
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd c+28(%rip), %ymm3
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
 ; AVX2-NEXT:    testb $64, %al
 ; AVX2-NEXT:    jne .LBB4_45
 ; AVX2-NEXT:    jmp .LBB4_46
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -3463,51 +3463,51 @@
 ; AVX2-NEXT:    testl $256, %eax ## imm = 0x100
 ; AVX2-NEXT:    je LBB22_18
 ; AVX2-NEXT:  LBB22_17: ## %cond.load22
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastw 16(%rdi), %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    testl $512, %eax ## imm = 0x200
 ; AVX2-NEXT:    je LBB22_20
 ; AVX2-NEXT:  LBB22_19: ## %cond.load25
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpinsrw $1, 18(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastw 18(%rdi), %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    testl $1024, %eax ## imm = 0x400
 ; AVX2-NEXT:    je LBB22_22
 ; AVX2-NEXT:  LBB22_21: ## %cond.load28
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpinsrw $2, 20(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastw 20(%rdi), %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    testl $2048, %eax ## imm = 0x800
 ; AVX2-NEXT:    je LBB22_24
 ; AVX2-NEXT:  LBB22_23: ## %cond.load31
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpinsrw $3, 22(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastw 22(%rdi), %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    testl $4096, %eax ## imm = 0x1000
 ; AVX2-NEXT:    je LBB22_26
 ; AVX2-NEXT:  LBB22_25: ## %cond.load34
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpinsrw $4, 24(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastw 24(%rdi), %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    testl $8192, %eax ## imm = 0x2000
 ; AVX2-NEXT:    je LBB22_28
 ; AVX2-NEXT:  LBB22_27: ## %cond.load37
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpinsrw $5, 26(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastw 26(%rdi), %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    testl $16384, %eax ## imm = 0x4000
 ; AVX2-NEXT:    je LBB22_30
 ; AVX2-NEXT:  LBB22_29: ## %cond.load40
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpinsrw $6, 28(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastw 28(%rdi), %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    testl $32768, %eax ## imm = 0x8000
 ; AVX2-NEXT:    je LBB22_32
 ; AVX2-NEXT:  LBB22_31: ## %cond.load43
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT:    vpinsrw $7, 30(%rdi), %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastw 30(%rdi), %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -3609,51 +3609,51 @@
 ; AVX512F-NEXT:    testl $256, %eax ## imm = 0x100
 ; AVX512F-NEXT:    je LBB22_18
 ; AVX512F-NEXT:  LBB22_17: ## %cond.load22
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastw 16(%rdi), %ymm0
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-NEXT:    testl $512, %eax ## imm = 0x200
 ; AVX512F-NEXT:    je LBB22_20
 ; AVX512F-NEXT:  LBB22_19: ## %cond.load25
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT:    vpinsrw $1, 18(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastw 18(%rdi), %ymm0
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-NEXT:    testl $1024, %eax ## imm = 0x400
 ; AVX512F-NEXT:    je LBB22_22
 ; AVX512F-NEXT:  LBB22_21: ## %cond.load28
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT:    vpinsrw $2, 20(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastw 20(%rdi), %ymm0
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX512F-NEXT:    testl $2048, %eax ## imm = 0x800
 ; AVX512F-NEXT:    je LBB22_24
 ; AVX512F-NEXT:  LBB22_23: ## %cond.load31
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT:    vpinsrw $3, 22(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT:    vpbroadcastw 22(%rdi), %ymm0
+; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX512F-NEXT: je LBB22_26 ; AVX512F-NEXT: LBB22_25: ## %cond.load34 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 24(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX512F-NEXT: je LBB22_28 ; AVX512F-NEXT: LBB22_27: ## %cond.load37 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 26(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX512F-NEXT: je LBB22_30 ; AVX512F-NEXT: LBB22_29: ## %cond.load40 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 28(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX512F-NEXT: je LBB22_32 ; AVX512F-NEXT: LBB22_31: ## %cond.load43 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 30(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -3755,51 +3755,51 @@ ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 ; AVX512VLDQ-NEXT: je LBB22_18 ; AVX512VLDQ-NEXT: LBB22_17: ## %cond.load22 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 16(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 ; AVX512VLDQ-NEXT: je LBB22_20 ; AVX512VLDQ-NEXT: LBB22_19: ## %cond.load25 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 18(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX512VLDQ-NEXT: je LBB22_22 ; AVX512VLDQ-NEXT: LBB22_21: ## %cond.load28 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 20(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} 
ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX512VLDQ-NEXT: je LBB22_24 ; AVX512VLDQ-NEXT: LBB22_23: ## %cond.load31 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 22(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX512VLDQ-NEXT: je LBB22_26 ; AVX512VLDQ-NEXT: LBB22_25: ## %cond.load34 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 24(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX512VLDQ-NEXT: je LBB22_28 ; AVX512VLDQ-NEXT: LBB22_27: ## %cond.load37 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 26(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX512VLDQ-NEXT: je LBB22_30 ; AVX512VLDQ-NEXT: LBB22_29: ## %cond.load40 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 28(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX512VLDQ-NEXT: je LBB22_32 ; AVX512VLDQ-NEXT: LBB22_31: ## %cond.load43 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 30(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLDQ-NEXT: retq ; @@ -5642,99 +5642,99 @@ ; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX2-NEXT: je LBB24_34 ; AVX2-NEXT: LBB24_33: ## %cond.load46 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 16(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000 ; AVX2-NEXT: je LBB24_36 ; AVX2-NEXT: LBB24_35: ## %cond.load49 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb 
$1, 17(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 17(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000 ; AVX2-NEXT: je LBB24_38 ; AVX2-NEXT: LBB24_37: ## %cond.load52 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 18(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000 ; AVX2-NEXT: je LBB24_40 ; AVX2-NEXT: LBB24_39: ## %cond.load55 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 19(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX2-NEXT: je LBB24_42 ; AVX2-NEXT: LBB24_41: ## %cond.load58 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 20(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 ; AVX2-NEXT: je LBB24_44 ; AVX2-NEXT: LBB24_43: ## %cond.load61 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 21(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 ; AVX2-NEXT: je LBB24_46 ; AVX2-NEXT: LBB24_45: ## %cond.load64 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 22(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 ; AVX2-NEXT: je LBB24_48 ; AVX2-NEXT: LBB24_47: ## %cond.load67 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 23(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; AVX2-NEXT: je LBB24_50 ; AVX2-NEXT: LBB24_49: ## 
%cond.load70 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 24(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000 ; AVX2-NEXT: je LBB24_52 ; AVX2-NEXT: LBB24_51: ## %cond.load73 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 25(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000 ; AVX2-NEXT: je LBB24_54 ; AVX2-NEXT: LBB24_53: ## %cond.load76 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 26(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000 ; AVX2-NEXT: je LBB24_56 ; AVX2-NEXT: LBB24_55: ## %cond.load79 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 27(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX2-NEXT: je LBB24_58 ; AVX2-NEXT: LBB24_57: ## %cond.load82 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 28(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; AVX2-NEXT: je LBB24_60 ; AVX2-NEXT: LBB24_59: ## %cond.load85 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 29(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; AVX2-NEXT: je LBB24_62 ; AVX2-NEXT: LBB24_61: ## %cond.load88 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb 30(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 
 ; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
 ; AVX2-NEXT: je LBB24_64
 ; AVX2-NEXT: LBB24_63: ## %cond.load91
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastb 31(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -5920,99 +5920,99 @@
 ; AVX512F-NEXT: testl $65536, %eax ## imm = 0x10000
 ; AVX512F-NEXT: je LBB24_34
 ; AVX512F-NEXT: LBB24_33: ## %cond.load46
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 16(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $131072, %eax ## imm = 0x20000
 ; AVX512F-NEXT: je LBB24_36
 ; AVX512F-NEXT: LBB24_35: ## %cond.load49
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 17(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $262144, %eax ## imm = 0x40000
 ; AVX512F-NEXT: je LBB24_38
 ; AVX512F-NEXT: LBB24_37: ## %cond.load52
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 18(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $524288, %eax ## imm = 0x80000
 ; AVX512F-NEXT: je LBB24_40
 ; AVX512F-NEXT: LBB24_39: ## %cond.load55
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 19(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $1048576, %eax ## imm = 0x100000
 ; AVX512F-NEXT: je LBB24_42
 ; AVX512F-NEXT: LBB24_41: ## %cond.load58
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 20(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $2097152, %eax ## imm = 0x200000
 ; AVX512F-NEXT: je LBB24_44
 ; AVX512F-NEXT: LBB24_43: ## %cond.load61
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 21(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $4194304, %eax ## imm = 0x400000
 ; AVX512F-NEXT: je LBB24_46
 ; AVX512F-NEXT: LBB24_45: ## %cond.load64
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 22(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $8388608, %eax ## imm = 0x800000
 ; AVX512F-NEXT: je LBB24_48
 ; AVX512F-NEXT: LBB24_47: ## %cond.load67
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 23(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $16777216, %eax ## imm = 0x1000000
 ; AVX512F-NEXT: je LBB24_50
 ; AVX512F-NEXT: LBB24_49: ## %cond.load70
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 24(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $33554432, %eax ## imm = 0x2000000
 ; AVX512F-NEXT: je LBB24_52
 ; AVX512F-NEXT: LBB24_51: ## %cond.load73
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 25(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $67108864, %eax ## imm = 0x4000000
 ; AVX512F-NEXT: je LBB24_54
 ; AVX512F-NEXT: LBB24_53: ## %cond.load76
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 26(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $134217728, %eax ## imm = 0x8000000
 ; AVX512F-NEXT: je LBB24_56
 ; AVX512F-NEXT: LBB24_55: ## %cond.load79
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 27(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $268435456, %eax ## imm = 0x10000000
 ; AVX512F-NEXT: je LBB24_58
 ; AVX512F-NEXT: LBB24_57: ## %cond.load82
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 28(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $536870912, %eax ## imm = 0x20000000
 ; AVX512F-NEXT: je LBB24_60
 ; AVX512F-NEXT: LBB24_59: ## %cond.load85
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 29(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $1073741824, %eax ## imm = 0x40000000
 ; AVX512F-NEXT: je LBB24_62
 ; AVX512F-NEXT: LBB24_61: ## %cond.load88
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 30(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
 ; AVX512F-NEXT: je LBB24_64
 ; AVX512F-NEXT: LBB24_63: ## %cond.load91
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512F-NEXT: vpbroadcastb 31(%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512F-NEXT: retq
 ;
@@ -6198,99 +6198,83 @@
 ; AVX512VLDQ-NEXT: testl $65536, %eax ## imm = 0x10000
 ; AVX512VLDQ-NEXT: je LBB24_34
 ; AVX512VLDQ-NEXT: LBB24_33: ## %cond.load46
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $0, 16(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 16(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $131072, %eax ## imm = 0x20000
 ; AVX512VLDQ-NEXT: je LBB24_36
 ; AVX512VLDQ-NEXT: LBB24_35: ## %cond.load49
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $1, 17(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 17(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $262144, %eax ## imm = 0x40000
 ; AVX512VLDQ-NEXT: je LBB24_38
 ; AVX512VLDQ-NEXT: LBB24_37: ## %cond.load52
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $2, 18(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 18(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $524288, %eax ## imm = 0x80000
 ; AVX512VLDQ-NEXT: je LBB24_40
 ; AVX512VLDQ-NEXT: LBB24_39: ## %cond.load55
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $3, 19(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 19(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $1048576, %eax ## imm = 0x100000
 ; AVX512VLDQ-NEXT: je LBB24_42
 ; AVX512VLDQ-NEXT: LBB24_41: ## %cond.load58
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $4, 20(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 20(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $2097152, %eax ## imm = 0x200000
 ; AVX512VLDQ-NEXT: je LBB24_44
 ; AVX512VLDQ-NEXT: LBB24_43: ## %cond.load61
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $5, 21(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 21(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $4194304, %eax ## imm = 0x400000
 ; AVX512VLDQ-NEXT: je LBB24_46
 ; AVX512VLDQ-NEXT: LBB24_45: ## %cond.load64
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $6, 22(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 22(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $8388608, %eax ## imm = 0x800000
 ; AVX512VLDQ-NEXT: je LBB24_48
 ; AVX512VLDQ-NEXT: LBB24_47: ## %cond.load67
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $7, 23(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 23(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $16777216, %eax ## imm = 0x1000000
 ; AVX512VLDQ-NEXT: je LBB24_50
 ; AVX512VLDQ-NEXT: LBB24_49: ## %cond.load70
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $8, 24(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 24(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $33554432, %eax ## imm = 0x2000000
 ; AVX512VLDQ-NEXT: je LBB24_52
 ; AVX512VLDQ-NEXT: LBB24_51: ## %cond.load73
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $9, 25(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 25(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $67108864, %eax ## imm = 0x4000000
 ; AVX512VLDQ-NEXT: je LBB24_54
 ; AVX512VLDQ-NEXT: LBB24_53: ## %cond.load76
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $10, 26(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 26(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $134217728, %eax ## imm = 0x8000000
 ; AVX512VLDQ-NEXT: je LBB24_56
 ; AVX512VLDQ-NEXT: LBB24_55: ## %cond.load79
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $11, 27(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 27(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $268435456, %eax ## imm = 0x10000000
 ; AVX512VLDQ-NEXT: je LBB24_58
 ; AVX512VLDQ-NEXT: LBB24_57: ## %cond.load82
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $12, 28(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 28(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $536870912, %eax ## imm = 0x20000000
 ; AVX512VLDQ-NEXT: je LBB24_60
 ; AVX512VLDQ-NEXT: LBB24_59: ## %cond.load85
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $13, 29(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 29(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $1073741824, %eax ## imm = 0x40000000
 ; AVX512VLDQ-NEXT: je LBB24_62
 ; AVX512VLDQ-NEXT: LBB24_61: ## %cond.load88
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 30(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
 ; AVX512VLDQ-NEXT: je LBB24_64
 ; AVX512VLDQ-NEXT: LBB24_63: ## %cond.load91
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VLDQ-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT: vpbroadcastb 31(%rdi), %ymm0
+; AVX512VLDQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
 ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512VLDQ-NEXT: retq
 ;
@@ -7093,24 +7077,21 @@
 ;
 ; AVX2-LABEL: load_one_mask_bit_set3:
 ; AVX2: ## %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_one_mask_bit_set3:
 ; AVX512: ## %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1
+; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX512-NEXT: retq
 ;
 ; X86-AVX512-LABEL: load_one_mask_bit_set3:
 ; X86-AVX512: ## %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX512-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX512-NEXT: vbroadcastsd 16(%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; X86-AVX512-NEXT: retl
 %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> %val)
 ret <4 x i64> %res
@@ -7126,17 +7107,15 @@
 ;
 ; AVX-LABEL: load_one_mask_bit_set4:
 ; AVX: ## %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vbroadcastsd 24(%rdi), %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX-NEXT: retq
 ;
 ; X86-AVX512-LABEL: load_one_mask_bit_set4:
 ; X86-AVX512: ## %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX512-NEXT: vbroadcastsd 24(%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; X86-AVX512-NEXT: retl
 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> %val)
 ret <4 x double> %res
@@ -7152,24 +7131,37 @@
 ;
 ; AVX1OR2-LABEL: load_one_mask_bit_set5:
 ; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1OR2-NEXT: vbroadcastsd 56(%rdi), %ymm2
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
 ; AVX1OR2-NEXT: retq
 ;
-; AVX512-LABEL: load_one_mask_bit_set5:
-; AVX512: ## %bb.0:
-; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_one_mask_bit_set5:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movb $-128, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
+; AVX512F-NEXT: retq
+;
+; AVX512VLDQ-LABEL: load_one_mask_bit_set5:
+; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movb $-128, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
+; AVX512VLDQ-NEXT: retq
+;
+; AVX512VLBW-LABEL: load_one_mask_bit_set5:
+; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movb $-128, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1}
+; AVX512VLBW-NEXT: retq
 ;
 ; X86-AVX512-LABEL: load_one_mask_bit_set5:
 ; X86-AVX512: ## %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; X86-AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; X86-AVX512-NEXT: movb $-128, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vbroadcastsd 56(%eax), %zmm0 {%k1}
 ; X86-AVX512-NEXT: retl
 %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> , <8 x double> %val)
 ret <8 x double> %res
@@ -7235,43 +7227,43 @@
 ;
 ; AVX512F-LABEL: load_one_mask_bit_set6:
 ; AVX512F: ## %bb.0:
+; AVX512F-NEXT: movb $4, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1}
 ; AVX512F-NEXT: movb $36, %al
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2
-; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VLDQ-LABEL: load_one_mask_bit_set6:
 ; AVX512VLDQ: ## %bb.0:
+; AVX512VLDQ-NEXT: movb $4, %al
+; AVX512VLDQ-NEXT: kmovw %eax, %k1
+; AVX512VLDQ-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1}
 ; AVX512VLDQ-NEXT: movb $36, %al
 ; AVX512VLDQ-NEXT: kmovw %eax, %k1
 ; AVX512VLDQ-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1}
-; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VLDQ-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2
-; AVX512VLDQ-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0
 ; AVX512VLDQ-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: load_one_mask_bit_set6:
 ; AVX512VLBW: ## %bb.0:
+; AVX512VLBW-NEXT: movb $4, %al
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1}
 ; AVX512VLBW-NEXT: movb $36, %al
 ; AVX512VLBW-NEXT: kmovd %eax, %k1
 ; AVX512VLBW-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1}
-; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VLBW-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: retq
 ;
 ; X86-AVX512-LABEL: load_one_mask_bit_set6:
 ; X86-AVX512: ## %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $4, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vbroadcastsd 16(%eax), %zmm0 {%k1}
 ; X86-AVX512-NEXT: movb $36, %cl
 ; X86-AVX512-NEXT: kmovd %ecx, %k1
 ; X86-AVX512-NEXT: vmovdqu64 64(%eax), %zmm1 {%k1}
-; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X86-AVX512-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
-; X86-AVX512-NEXT: vinsertf32x4 $1, %xmm2, %zmm0, %zmm0
 ; X86-AVX512-NEXT: retl
 %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %addr, i32 4, <16 x i1> , <16 x i64> %val)
 ret <16 x i64> %res
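
Illustrative note (not part of the patch): every test hunk above is fallout from the X86ISelLowering.cpp change, which lowers an insertelement whose index falls in a non-low 128-bit subvector as a scalar broadcast plus a single blend, instead of the old extract-subvector / insert-scalar / reinsert-subvector sequence. A minimal IR reproducer under those assumptions (function and value names hypothetical):

define <4 x i64> @insert_upper_elt(<4 x i64> %v, i64* %p) {
  ; element 2 lives in the upper 128-bit lane, so the broadcast+blend path applies
  %s = load i64, i64* %p
  %r = insertelement <4 x i64> %v, i64 %s, i32 2
  ret <4 x i64> %r
}

Compiled with llc -mtriple=x86_64-- -mattr=+avx2, this should now select a broadcast of the loaded scalar (a vpbroadcastq/vbroadcastsd from memory) followed by one blend against %v, matching the two-instruction sequences checked throughout the updated tests.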