diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4642,7 +4642,6 @@
   }
   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
-  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
   if (!PtrTy && Ptr->getType()->isVectorTy())
     PtrTy = dyn_cast<PointerType>(
@@ -4650,22 +4649,10 @@
   assert(PtrTy && "Unexpected type for Ptr argument");
   unsigned AddressSpace = PtrTy->getAddressSpace();
-  bool Scalarize = false;
   if ((Opcode == Instruction::Load &&
        !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
       (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
-    Scalarize = true;
-  // Gather / Scatter for vector 2 is not profitable on KNL / SKX
-  // Vector-4 of gather/scatter instruction does not exist on KNL.
-  // We can extend it to 8 elements, but zeroing upper bits of
-  // the mask vector will add more instructions. Right now we give the scalar
-  // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
-  // is better in the VariableMask case.
-  if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
-    Scalarize = true;
-
-  if (Scalarize)
     return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
                            AddressSpace);
@@ -4801,6 +4788,14 @@
     unsigned NumElts = DataVTy->getNumElements();
     if (NumElts == 1)
       return false;
+    // Gathers / scatters of 2-element vectors are not profitable on KNL / SKX.
+    // 4-element gather/scatter instructions do not exist on KNL. We could
+    // widen to 8 elements, but zeroing the upper bits of the mask vector
+    // would add more instructions. Right now we give the scalar cost for
+    // 4-element vectors on KNL. TODO: Check whether the gather/scatter
+    // instruction is better in the VariableMask case.
+    if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())))
+      return false;
   }
   Type *ScalarTy = DataTy->getScalarType();
   if (ScalarTy->isPointerTy())
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3009,13 +3009,21 @@
       }
       return;
     }
-    // Vectorizing non-consecutive loads with `llvm.masked.gather`.
-    TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
-                                 UserTreeIdx, ReuseShuffleIndicies);
-    TE->setOperandsInOrder();
-    buildTree_rec(PointerOps, Depth + 1, {TE, 0});
-    LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
-    return;
+    Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
+    for (Value *V : VL)
+      CommonAlignment =
+          commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+    if (TTI->isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
+                                 CommonAlignment)) {
+      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
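+      // Note: commonAlignment() returns the minimum of the two power-of-two
+      // alignments, so the gather is formed only when it is legal for the
+      // least-aligned load in the bundle; otherwise we fall through to the
+      // scalarized gathering path below.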
+      TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle,
+                                   S, UserTreeIdx, ReuseShuffleIndicies);
+      TE->setOperandsInOrder();
+      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+      LLVM_DEBUG(dbgs()
+                 << "SLP: added a vector of non-consecutive loads.\n");
+      return;
+    }
   }

   LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
@@ -4077,6 +4085,10 @@
                                 CostKind, VL0);
     } else {
       assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+      Align CommonAlignment = alignment;
+      for (Value *V : VL)
+        CommonAlignment =
+            commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
       VecLdCost = TTI->getGatherScatterOpCost(
           Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
-          /*VariableMask=*/false, alignment, CostKind, VL0);
+          /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -137,13 +137,40 @@
 ;
 ; AVX512F-LABEL: gather_v4f32_ptr_v4i32:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftlw $12, %k0, %k0
-; AVX512F-NEXT: kshiftrw $12, %k0, %k1
-; AVX512F-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je .LBB0_2
+; AVX512F-NEXT: # %bb.1: # %cond.load
+; AVX512F-NEXT: vmovq %xmm0, %rcx
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX512F-NEXT: .LBB0_2: # %else
+; AVX512F-NEXT: testb $2, %al
+; AVX512F-NEXT: je .LBB0_4
+; AVX512F-NEXT: # %bb.3: # %cond.load1
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX512F-NEXT: .LBB0_4: # %else2
+; AVX512F-NEXT: testb $4, %al
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: jne .LBB0_5
+; AVX512F-NEXT: # %bb.6: # %else5
+; AVX512F-NEXT: testb $8, %al
+; AVX512F-NEXT: jne .LBB0_7
+; AVX512F-NEXT: .LBB0_8: # %else8
+; AVX512F-NEXT: vmovaps %xmm2, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+; AVX512F-NEXT: .LBB0_5: # %cond.load4
+; AVX512F-NEXT: vmovq %xmm0, %rcx
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX512F-NEXT: testb $8, %al
+; AVX512F-NEXT: je .LBB0_8
+; AVX512F-NEXT: .LBB0_7: # %cond.load7
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX512F-NEXT: vmovaps %xmm2, %xmm0
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -304,13 +331,45 @@
 ;
 ; AVX512F-LABEL: gather_v4f32_v4i32_v4i32:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT: vmovq %rdi, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %ymm3
+; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512F-NEXT: vpsllq $2, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0
 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftlw $12, %k0, %k0
-; AVX512F-NEXT: kshiftrw $12, %k0, %k1
-; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je .LBB1_2
+; AVX512F-NEXT: # %bb.1: # %cond.load
+; AVX512F-NEXT: vmovq %xmm0, %rcx
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 =
mem[0],zero,zero,zero +; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; AVX512F-NEXT: .LBB1_2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: je .LBB1_4 +; AVX512F-NEXT: # %bb.3: # %cond.load1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] +; AVX512F-NEXT: .LBB1_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: jne .LBB1_5 +; AVX512F-NEXT: # %bb.6: # %else5 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB1_7 +; AVX512F-NEXT: .LBB1_8: # %else8 +; AVX512F-NEXT: vmovaps %xmm2, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB1_5: # %cond.load4 +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: je .LBB1_8 +; AVX512F-NEXT: .LBB1_7: # %cond.load7 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; AVX512F-NEXT: vmovaps %xmm2, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -468,13 +527,44 @@ ; ; AVX512F-LABEL: gather_v4f32_v4i64_v4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vmovq %rdi, %xmm3 +; AVX512F-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX512F-NEXT: vpsllq $2, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $12, %k0, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: je .LBB2_2 +; AVX512F-NEXT: # %bb.1: # %cond.load +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; AVX512F-NEXT: .LBB2_2: # %else +; AVX512F-NEXT: testb $2, %al +; AVX512F-NEXT: je .LBB2_4 +; AVX512F-NEXT: # %bb.3: # %cond.load1 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] +; AVX512F-NEXT: .LBB2_4: # %else2 +; AVX512F-NEXT: testb $4, %al +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: jne .LBB2_5 +; AVX512F-NEXT: # %bb.6: # %else5 +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: jne .LBB2_7 +; AVX512F-NEXT: .LBB2_8: # %else8 +; AVX512F-NEXT: vmovaps %xmm2, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; AVX512F-NEXT: .LBB2_5: # %cond.load4 +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; AVX512F-NEXT: testb $8, %al +; AVX512F-NEXT: je .LBB2_8 +; AVX512F-NEXT: .LBB2_7: # %cond.load7 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; AVX512F-NEXT: vmovaps %xmm2, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -820,26 +820,88 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { ; KNL_64-LABEL: test15: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL_64-NEXT: 
kshiftlw $12, %k0, %k0 -; KNL_64-NEXT: kshiftrw $12, %k0, %k1 -; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vmovaps %xmm1, %xmm0 +; KNL_64-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL_64-NEXT: vpsllq $2, %ymm0, %ymm0 +; KNL_64-NEXT: vmovq %rdi, %xmm1 +; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1 +; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm1 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: # implicit-def: $xmm0 +; KNL_64-NEXT: je .LBB14_2 +; KNL_64-NEXT: # %bb.1: # %cond.load +; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL_64-NEXT: .LBB14_2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB14_4 +; KNL_64-NEXT: # %bb.3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx +; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; KNL_64-NEXT: .LBB14_4: # %else2 +; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL_64-NEXT: jne .LBB14_5 +; KNL_64-NEXT: # %bb.6: # %else5 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: jne .LBB14_7 +; KNL_64-NEXT: .LBB14_8: # %else8 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB14_5: # %cond.load4 +; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: je .LBB14_8 +; KNL_64-NEXT: .LBB14_7: # %cond.load7 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rax +; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test15: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL_32-NEXT: kshiftlw $12, %k0, %k0 -; KNL_32-NEXT: kshiftrw $12, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vmovaps %xmm1, %xmm0 +; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: # implicit-def: $xmm0 +; KNL_32-NEXT: jne .LBB14_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB14_3 +; KNL_32-NEXT: .LBB14_4: # %else2 +; KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: jne .LBB14_5 +; KNL_32-NEXT: .LBB14_6: # %else5 +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: jne .LBB14_7 +; KNL_32-NEXT: .LBB14_8: # %else8 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB14_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm1, %ecx +; KNL_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB14_4 +; KNL_32-NEXT: .LBB14_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx +; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] +; KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: je .LBB14_6 +; KNL_32-NEXT: .LBB14_5: # %cond.load4 +; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx +; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: je .LBB14_8 +; KNL_32-NEXT: .LBB14_7: # %cond.load7 +; KNL_32-NEXT: vpextrd $3, %xmm1, %eax +; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -870,27 +932,99 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) { ; KNL_64-LABEL: test16: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # 
kill: def $ymm2 killed $ymm2 def $zmm2 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL_64-NEXT: kshiftlw $12, %k0, %k0 -; KNL_64-NEXT: kshiftrw $12, %k0, %k1 -; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} -; KNL_64-NEXT: vmovapd %ymm2, %ymm0 +; KNL_64-NEXT: vpmovsxdq %xmm0, %ymm0 +; KNL_64-NEXT: vpsllq $3, %ymm0, %ymm0 +; KNL_64-NEXT: vmovq %rdi, %xmm1 +; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1 +; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: je .LBB15_2 +; KNL_64-NEXT: # %bb.1: # %cond.load +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; KNL_64-NEXT: .LBB15_2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB15_4 +; KNL_64-NEXT: # %bb.3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx +; KNL_64-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1] +; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; KNL_64-NEXT: .LBB15_4: # %else2 +; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL_64-NEXT: jne .LBB15_5 +; KNL_64-NEXT: # %bb.6: # %else5 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: jne .LBB15_7 +; KNL_64-NEXT: .LBB15_8: # %else8 +; KNL_64-NEXT: vmovdqa %ymm2, %ymm0 +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB15_5: # %cond.load4 +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vextracti128 $1, %ymm2, %xmm1 +; KNL_64-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; KNL_64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: je .LBB15_8 +; KNL_64-NEXT: .LBB15_7: # %cond.load7 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vextracti128 $1, %ymm2, %xmm0 +; KNL_64-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; KNL_64-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; KNL_64-NEXT: vmovdqa %ymm2, %ymm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test16: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL_32-NEXT: kshiftlw $12, %k0, %k0 -; KNL_32-NEXT: kshiftrw $12, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1} -; KNL_32-NEXT: vmovapd %ymm2, %ymm0 +; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB15_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB15_3 +; KNL_32-NEXT: .LBB15_4: # %else2 +; KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: jne .LBB15_5 +; KNL_32-NEXT: .LBB15_6: # %else5 +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: jne .LBB15_7 +; KNL_32-NEXT: .LBB15_8: # %else8 +; KNL_32-NEXT: vmovdqa %ymm2, %ymm0 +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB15_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB15_4 +; KNL_32-NEXT: .LBB15_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx +; KNL_32-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1] +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; 
KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: je .LBB15_6 +; KNL_32-NEXT: .LBB15_5: # %cond.load4 +; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx +; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm1 +; KNL_32-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; KNL_32-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: je .LBB15_8 +; KNL_32-NEXT: .LBB15_7: # %cond.load7 +; KNL_32-NEXT: vpextrd $3, %xmm0, %eax +; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm0 +; KNL_32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; KNL_32-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; KNL_32-NEXT: vmovdqa %ymm2, %ymm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test16: @@ -919,46 +1053,116 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) { ; KNL_64-LABEL: test17: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} -; KNL_64-NEXT: vmovapd %xmm2, %xmm0 +; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 +; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %rdi, %xmm1 +; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 +; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB16_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB16_3 +; KNL_64-NEXT: .LBB16_4: # %else2 +; KNL_64-NEXT: vmovaps %xmm2, %xmm0 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB16_1: # %cond.load +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB16_4 +; KNL_64-NEXT: .LBB16_3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; KNL_64-NEXT: vmovaps %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test17: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1} -; KNL_32-NEXT: vmovapd %xmm2, %xmm0 +; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB16_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB16_3 +; KNL_32-NEXT: .LBB16_4: # %else2 +; KNL_32-NEXT: vmovaps %xmm2, %xmm0 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB16_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB16_4 +; KNL_32-NEXT: .LBB16_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %eax +; KNL_32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; KNL_32-NEXT: vmovaps %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test17: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vgatherdpd 
(%rdi,%xmm0,8), %xmm2 {%k1} -; SKX-NEXT: vmovapd %xmm2, %xmm0 +; SKX-NEXT: vpmovq2m %xmm1, %k0 +; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 +; SKX-NEXT: vpbroadcastq %rdi, %xmm1 +; SKX-NEXT: vpsllq $3, %xmm0, %xmm0 +; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb $1, %al +; SKX-NEXT: jne .LBB16_1 +; SKX-NEXT: # %bb.2: # %else +; SKX-NEXT: testb $2, %al +; SKX-NEXT: jne .LBB16_3 +; SKX-NEXT: .LBB16_4: # %else2 +; SKX-NEXT: vmovaps %xmm2, %xmm0 +; SKX-NEXT: retq +; SKX-NEXT: .LBB16_1: # %cond.load +; SKX-NEXT: vmovq %xmm0, %rcx +; SKX-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; SKX-NEXT: testb $2, %al +; SKX-NEXT: je .LBB16_4 +; SKX-NEXT: .LBB16_3: # %cond.load1 +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; SKX-NEXT: vmovaps %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test17: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovq2m %xmm1, %k1 -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %xmm2 {%k1} -; SKX_32-NEXT: vmovapd %xmm2, %xmm0 +; SKX_32-NEXT: vpmovq2m %xmm1, %k0 +; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: jne .LBB16_1 +; SKX_32-NEXT: # %bb.2: # %else +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: jne .LBB16_3 +; SKX_32-NEXT: .LBB16_4: # %else2 +; SKX_32-NEXT: vmovaps %xmm2, %xmm0 +; SKX_32-NEXT: retl +; SKX_32-NEXT: .LBB16_1: # %cond.load +; SKX_32-NEXT: vmovd %xmm0, %ecx +; SKX_32-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: je .LBB16_4 +; SKX_32-NEXT: .LBB16_3: # %cond.load1 +; SKX_32-NEXT: vpextrd $1, %xmm0, %eax +; SKX_32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; SKX_32-NEXT: vmovaps %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -976,25 +1180,78 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { ; KNL_64-LABEL: test18: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0 -; KNL_64-NEXT: kshiftlw $12, %k0, %k0 -; KNL_64-NEXT: kshiftrw $12, %k0, %k1 -; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: je .LBB17_2 +; KNL_64-NEXT: # %bb.1: # %cond.store +; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vmovss %xmm0, (%rcx) +; KNL_64-NEXT: .LBB17_2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB17_4 +; KNL_64-NEXT: # %bb.3: # %cond.store1 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx +; KNL_64-NEXT: vextractps $1, %xmm0, (%rcx) +; KNL_64-NEXT: .LBB17_4: # %else2 +; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL_64-NEXT: jne .LBB17_5 +; KNL_64-NEXT: # %bb.6: # %else4 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: jne .LBB17_7 +; KNL_64-NEXT: .LBB17_8: # %else6 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB17_5: # %cond.store3 +; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vextractps $2, %xmm0, (%rcx) +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: je .LBB17_8 +; KNL_64-NEXT: .LBB17_7: # %cond.store5 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rax +; KNL_64-NEXT: vextractps $3, %xmm0, (%rax) ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test18: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # 
kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0 -; KNL_32-NEXT: kshiftlw $12, %k0, %k0 -; KNL_32-NEXT: kshiftrw $12, %k0, %k1 -; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB17_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB17_3 +; KNL_32-NEXT: .LBB17_4: # %else2 +; KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: jne .LBB17_5 +; KNL_32-NEXT: .LBB17_6: # %else4 +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: jne .LBB17_7 +; KNL_32-NEXT: .LBB17_8: # %else6 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB17_1: # %cond.store +; KNL_32-NEXT: vmovd %xmm1, %ecx +; KNL_32-NEXT: vmovss %xmm0, (%ecx) +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB17_4 +; KNL_32-NEXT: .LBB17_3: # %cond.store1 +; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx +; KNL_32-NEXT: vextractps $1, %xmm0, (%ecx) +; KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: je .LBB17_6 +; KNL_32-NEXT: .LBB17_5: # %cond.store3 +; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx +; KNL_32-NEXT: vextractps $2, %xmm0, (%ecx) +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: je .LBB17_8 +; KNL_32-NEXT: .LBB17_7: # %cond.store5 +; KNL_32-NEXT: vpextrd $3, %xmm1, %eax +; KNL_32-NEXT: vextractps $3, %xmm0, (%eax) ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1019,26 +1276,85 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) { ; KNL_64-LABEL: test19: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 -; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL_64-NEXT: kshiftlw $12, %k0, %k0 -; KNL_64-NEXT: kshiftrw $12, %k0, %k1 -; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1} +; KNL_64-NEXT: vpsllq $3, %ymm2, %ymm1 +; KNL_64-NEXT: vmovq %rdi, %xmm2 +; KNL_64-NEXT: vpbroadcastq %xmm2, %ymm2 +; KNL_64-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: je .LBB18_2 +; KNL_64-NEXT: # %bb.1: # %cond.store +; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vmovlps %xmm0, (%rcx) +; KNL_64-NEXT: .LBB18_2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB18_4 +; KNL_64-NEXT: # %bb.3: # %cond.store1 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx +; KNL_64-NEXT: vmovhps %xmm0, (%rcx) +; KNL_64-NEXT: .LBB18_4: # %else2 +; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: vextractf128 $1, %ymm0, %xmm0 +; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL_64-NEXT: jne .LBB18_5 +; KNL_64-NEXT: # %bb.6: # %else4 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: jne .LBB18_7 +; KNL_64-NEXT: .LBB18_8: # %else6 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB18_5: # %cond.store3 +; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vmovlps %xmm0, (%rcx) +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: je .LBB18_8 +; KNL_64-NEXT: .LBB18_7: # %cond.store5 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rax +; KNL_64-NEXT: vmovhps %xmm0, (%rax) ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test19: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 -; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL_32-NEXT: kshiftlw $12, %k0, %k0 -; KNL_32-NEXT: kshiftrw $12, %k0, %k1 -; KNL_32-NEXT: 
movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1} +; KNL_32-NEXT: vpmovqd %zmm2, %ymm1 +; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 +; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: je .LBB18_2 +; KNL_32-NEXT: # %bb.1: # %cond.store +; KNL_32-NEXT: vmovd %xmm1, %ecx +; KNL_32-NEXT: vmovlps %xmm0, (%ecx) +; KNL_32-NEXT: .LBB18_2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB18_4 +; KNL_32-NEXT: # %bb.3: # %cond.store1 +; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx +; KNL_32-NEXT: vmovhps %xmm0, (%ecx) +; KNL_32-NEXT: .LBB18_4: # %else2 +; KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; KNL_32-NEXT: jne .LBB18_5 +; KNL_32-NEXT: # %bb.6: # %else4 +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: jne .LBB18_7 +; KNL_32-NEXT: .LBB18_8: # %else6 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB18_5: # %cond.store3 +; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx +; KNL_32-NEXT: vmovlps %xmm0, (%ecx) +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: je .LBB18_8 +; KNL_32-NEXT: .LBB18_7: # %cond.store5 +; KNL_32-NEXT: vpextrd $3, %xmm1, %eax +; KNL_32-NEXT: vmovhps %xmm0, (%eax) ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; @@ -1067,40 +1383,94 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { ; KNL_64-LABEL: test20: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB19_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB19_3 +; KNL_64-NEXT: .LBB19_4: # %else2 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB19_1: # %cond.store +; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vmovd %xmm0, (%rcx) +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB19_4 +; KNL_64-NEXT: .LBB19_3: # %cond.store1 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rax +; KNL_64-NEXT: vextractps $1, %xmm0, (%rax) ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test20: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: vscatterdps %zmm0, (,%zmm1) {%k1} +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB19_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB19_3 +; KNL_32-NEXT: .LBB19_4: # %else2 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB19_1: # %cond.store +; KNL_32-NEXT: vmovd %xmm1, %ecx +; KNL_32-NEXT: vmovd %xmm0, (%ecx) +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB19_4 +; KNL_32-NEXT: .LBB19_3: # %cond.store1 +; KNL_32-NEXT: vpextrd $1, %xmm1, %eax +; KNL_32-NEXT: vextractps $1, %xmm0, (%eax) ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test20: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vscatterqps %xmm0, (,%xmm1) {%k1} +; SKX-NEXT: vpmovq2m 
%xmm2, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb $1, %al +; SKX-NEXT: jne .LBB19_1 +; SKX-NEXT: # %bb.2: # %else +; SKX-NEXT: testb $2, %al +; SKX-NEXT: jne .LBB19_3 +; SKX-NEXT: .LBB19_4: # %else2 +; SKX-NEXT: retq +; SKX-NEXT: .LBB19_1: # %cond.store +; SKX-NEXT: vmovq %xmm1, %rcx +; SKX-NEXT: vmovd %xmm0, (%rcx) +; SKX-NEXT: testb $2, %al +; SKX-NEXT: je .LBB19_4 +; SKX-NEXT: .LBB19_3: # %cond.store1 +; SKX-NEXT: vpextrq $1, %xmm1, %rax +; SKX-NEXT: vextractps $1, %xmm0, (%rax) ; SKX-NEXT: retq ; ; SKX_32-LABEL: test20: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX_32-NEXT: vpmovq2m %xmm2, %k1 -; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1} +; SKX_32-NEXT: vpmovq2m %xmm2, %k0 +; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: jne .LBB19_1 +; SKX_32-NEXT: # %bb.2: # %else +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: jne .LBB19_3 +; SKX_32-NEXT: .LBB19_4: # %else2 +; SKX_32-NEXT: retl +; SKX_32-NEXT: .LBB19_1: # %cond.store +; SKX_32-NEXT: vmovd %xmm1, %ecx +; SKX_32-NEXT: vmovd %xmm0, (%ecx) +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: je .LBB19_4 +; SKX_32-NEXT: .LBB19_3: # %cond.store1 +; SKX_32-NEXT: vpextrd $1, %xmm1, %eax +; SKX_32-NEXT: vextractps $1, %xmm0, (%eax) ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask) ret void @@ -1110,40 +1480,94 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { ; KNL_64-LABEL: test21: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB20_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB20_3 +; KNL_64-NEXT: .LBB20_4: # %else2 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB20_1: # %cond.store +; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vmovss %xmm0, (%rcx) +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB20_4 +; KNL_64-NEXT: .LBB20_3: # %cond.store1 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rax +; KNL_64-NEXT: vextractps $1, %xmm0, (%rax) ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test21: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB20_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB20_3 +; KNL_32-NEXT: .LBB20_4: # %else2 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB20_1: # %cond.store +; KNL_32-NEXT: vmovd %xmm1, %ecx +; KNL_32-NEXT: vmovss %xmm0, (%ecx) +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB20_4 +; KNL_32-NEXT: .LBB20_3: # %cond.store1 +; KNL_32-NEXT: vpextrd $1, %xmm1, %eax +; KNL_32-NEXT: vextractps $1, %xmm0, (%eax) ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test21: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: 
vpscatterqd %xmm0, (,%xmm1) {%k1} +; SKX-NEXT: vpmovq2m %xmm2, %k0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb $1, %al +; SKX-NEXT: jne .LBB20_1 +; SKX-NEXT: # %bb.2: # %else +; SKX-NEXT: testb $2, %al +; SKX-NEXT: jne .LBB20_3 +; SKX-NEXT: .LBB20_4: # %else2 +; SKX-NEXT: retq +; SKX-NEXT: .LBB20_1: # %cond.store +; SKX-NEXT: vmovq %xmm1, %rcx +; SKX-NEXT: vmovss %xmm0, (%rcx) +; SKX-NEXT: testb $2, %al +; SKX-NEXT: je .LBB20_4 +; SKX-NEXT: .LBB20_3: # %cond.store1 +; SKX-NEXT: vpextrq $1, %xmm1, %rax +; SKX-NEXT: vextractps $1, %xmm0, (%rax) ; SKX-NEXT: retq ; ; SKX_32-LABEL: test21: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX_32-NEXT: vpmovq2m %xmm2, %k1 -; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} +; SKX_32-NEXT: vpmovq2m %xmm2, %k0 +; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: jne .LBB20_1 +; SKX_32-NEXT: # %bb.2: # %else +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: jne .LBB20_3 +; SKX_32-NEXT: .LBB20_4: # %else2 +; SKX_32-NEXT: retl +; SKX_32-NEXT: .LBB20_1: # %cond.store +; SKX_32-NEXT: vmovd %xmm1, %ecx +; SKX_32-NEXT: vmovss %xmm0, (%ecx) +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: je .LBB20_4 +; SKX_32-NEXT: .LBB20_3: # %cond.store1 +; SKX_32-NEXT: vpextrd $1, %xmm1, %eax +; SKX_32-NEXT: vextractps $1, %xmm0, (%eax) ; SKX_32-NEXT: retl call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) ret void @@ -1155,27 +1579,62 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) { ; KNL_64-LABEL: test22: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1} +; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 +; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %rdi, %xmm1 +; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 +; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB21_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB21_3 +; KNL_64-NEXT: .LBB21_4: # %else2 +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB21_1: # %cond.load +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; KNL_64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB21_4 +; KNL_64-NEXT: .LBB21_3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; KNL_64-NEXT: vmovaps %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test22: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm2 {%k1} +; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne 
.LBB21_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB21_3 +; KNL_32-NEXT: .LBB21_4: # %else2 +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB21_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; KNL_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB21_4 +; KNL_32-NEXT: .LBB21_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %eax +; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; KNL_32-NEXT: vmovaps %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl @@ -1183,17 +1642,56 @@ ; SKX-LABEL: test22: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vpmovq2m %xmm1, %k0 +; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 +; SKX-NEXT: vpbroadcastq %rdi, %xmm1 +; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 +; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb $1, %al +; SKX-NEXT: jne .LBB21_1 +; SKX-NEXT: # %bb.2: # %else +; SKX-NEXT: testb $2, %al +; SKX-NEXT: jne .LBB21_3 +; SKX-NEXT: .LBB21_4: # %else2 +; SKX-NEXT: vmovdqa %xmm2, %xmm0 +; SKX-NEXT: retq +; SKX-NEXT: .LBB21_1: # %cond.load +; SKX-NEXT: vmovq %xmm0, %rcx +; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SKX-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SKX-NEXT: testb $2, %al +; SKX-NEXT: je .LBB21_4 +; SKX-NEXT: .LBB21_3: # %cond.load1 +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; SKX-NEXT: vmovaps %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test22: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovq2m %xmm1, %k1 -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vpmovq2m %xmm1, %k0 +; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: jne .LBB21_1 +; SKX_32-NEXT: # %bb.2: # %else +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: jne .LBB21_3 +; SKX_32-NEXT: .LBB21_4: # %else2 +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 +; SKX_32-NEXT: retl +; SKX_32-NEXT: .LBB21_1: # %cond.load +; SKX_32-NEXT: vmovd %xmm0, %ecx +; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SKX_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: je .LBB21_4 +; SKX_32-NEXT: .LBB21_3: # %cond.load1 +; SKX_32-NEXT: vpextrd $1, %xmm0, %eax +; SKX_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; SKX_32-NEXT: vmovaps %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -1205,27 +1703,62 @@ define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) { ; KNL_64-LABEL: test22a: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} +; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %rdi, %xmm1 +; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 +; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %eax 
+; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB22_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB22_3 +; KNL_64-NEXT: .LBB22_4: # %else2 +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB22_1: # %cond.load +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; KNL_64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB22_4 +; KNL_64-NEXT: .LBB22_3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; KNL_64-NEXT: vmovaps %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test22a: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1} +; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB22_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB22_3 +; KNL_32-NEXT: .LBB22_4: # %else2 +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB22_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; KNL_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB22_4 +; KNL_32-NEXT: .LBB22_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %eax +; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; KNL_32-NEXT: vmovaps %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl @@ -1233,17 +1766,56 @@ ; SKX-LABEL: test22a: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vpmovq2m %xmm1, %k0 +; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 +; SKX-NEXT: vpbroadcastq %rdi, %xmm1 +; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb $1, %al +; SKX-NEXT: jne .LBB22_1 +; SKX-NEXT: # %bb.2: # %else +; SKX-NEXT: testb $2, %al +; SKX-NEXT: jne .LBB22_3 +; SKX-NEXT: .LBB22_4: # %else2 +; SKX-NEXT: vmovdqa %xmm2, %xmm0 +; SKX-NEXT: retq +; SKX-NEXT: .LBB22_1: # %cond.load +; SKX-NEXT: vmovq %xmm0, %rcx +; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SKX-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SKX-NEXT: testb $2, %al +; SKX-NEXT: je .LBB22_4 +; SKX-NEXT: .LBB22_3: # %cond.load1 +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; SKX-NEXT: vmovaps %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test22a: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovq2m %xmm1, %k1 -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vpmovq2m %xmm1, %k0 +; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, 
%xmm0 +; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: jne .LBB22_1 +; SKX_32-NEXT: # %bb.2: # %else +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: jne .LBB22_3 +; SKX_32-NEXT: .LBB22_4: # %else2 +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 +; SKX_32-NEXT: retl +; SKX_32-NEXT: .LBB22_1: # %cond.load +; SKX_32-NEXT: vmovd %xmm0, %ecx +; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SKX_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: je .LBB22_4 +; SKX_32-NEXT: .LBB22_3: # %cond.load1 +; SKX_32-NEXT: vpextrd $1, %xmm0, %eax +; SKX_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; SKX_32-NEXT: vmovaps %xmm2, %xmm0 ; SKX_32-NEXT: retl %gep.random = getelementptr float, float* %base, <2 x i64> %ind @@ -1257,27 +1829,60 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) { ; KNL_64-LABEL: test23: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} +; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 +; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %rdi, %xmm1 +; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 +; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB23_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB23_3 +; KNL_64-NEXT: .LBB23_4: # %else2 +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB23_1: # %cond.load +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB23_4 +; KNL_64-NEXT: .LBB23_3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} +; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB23_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB23_3 +; KNL_32-NEXT: .LBB23_4: # %else2 +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB23_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB23_4 +; KNL_32-NEXT: .LBB23_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %eax +; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl @@ -1285,17 +1890,54 @@ ; SKX-LABEL: test23: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vpmovq2m %xmm1, 
%k1 -; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vpmovq2m %xmm1, %k0 +; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 +; SKX-NEXT: vpbroadcastq %rdi, %xmm1 +; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 +; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb $1, %al +; SKX-NEXT: jne .LBB23_1 +; SKX-NEXT: # %bb.2: # %else +; SKX-NEXT: testb $2, %al +; SKX-NEXT: jne .LBB23_3 +; SKX-NEXT: .LBB23_4: # %else2 +; SKX-NEXT: vmovdqa %xmm2, %xmm0 +; SKX-NEXT: retq +; SKX-NEXT: .LBB23_1: # %cond.load +; SKX-NEXT: vmovq %xmm0, %rcx +; SKX-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 +; SKX-NEXT: testb $2, %al +; SKX-NEXT: je .LBB23_4 +; SKX-NEXT: .LBB23_3: # %cond.load1 +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 ; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test23: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovq2m %xmm1, %k1 -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vpmovq2m %xmm1, %k0 +; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: jne .LBB23_1 +; SKX_32-NEXT: # %bb.2: # %else +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: jne .LBB23_3 +; SKX_32-NEXT: .LBB23_4: # %else2 +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 +; SKX_32-NEXT: retl +; SKX_32-NEXT: .LBB23_1: # %cond.load +; SKX_32-NEXT: vmovd %xmm0, %ecx +; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: je .LBB23_4 +; SKX_32-NEXT: .LBB23_3: # %cond.load1 +; SKX_32-NEXT: vpextrd $1, %xmm0, %eax +; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -1307,27 +1949,60 @@ define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) { ; KNL_64-LABEL: test23b: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1} +; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %rdi, %xmm1 +; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 +; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB24_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB24_3 +; KNL_64-NEXT: .LBB24_4: # %else2 +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB24_1: # %cond.load +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB24_4 +; KNL_64-NEXT: .LBB24_3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23b: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1} +; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB24_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB24_3 +; KNL_32-NEXT: .LBB24_4: # %else2 +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB24_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB24_4 +; KNL_32-NEXT: .LBB24_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %eax +; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl @@ -1335,17 +2010,54 @@ ; SKX-LABEL: test23b: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm2 {%k1} +; SKX-NEXT: vpmovq2m %xmm1, %k0 +; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 +; SKX-NEXT: vpbroadcastq %rdi, %xmm1 +; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb $1, %al +; SKX-NEXT: jne .LBB24_1 +; SKX-NEXT: # %bb.2: # %else +; SKX-NEXT: testb $2, %al +; SKX-NEXT: jne .LBB24_3 +; SKX-NEXT: .LBB24_4: # %else2 +; SKX-NEXT: vmovdqa %xmm2, %xmm0 +; SKX-NEXT: retq +; SKX-NEXT: .LBB24_1: # %cond.load +; SKX-NEXT: vmovq %xmm0, %rcx +; SKX-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 +; SKX-NEXT: testb $2, %al +; SKX-NEXT: je .LBB24_4 +; SKX-NEXT: .LBB24_3: # %cond.load1 +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 ; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test23b: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovq2m %xmm1, %k1 -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm2 {%k1} +; SKX_32-NEXT: vpmovq2m %xmm1, %k0 +; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: jne .LBB24_1 +; SKX_32-NEXT: # %bb.2: # %else +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: jne .LBB24_3 +; SKX_32-NEXT: .LBB24_4: # %else2 +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 +; SKX_32-NEXT: retl +; SKX_32-NEXT: .LBB24_1: # %cond.load +; SKX_32-NEXT: vmovd %xmm0, %ecx +; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: je .LBB24_4 +; SKX_32-NEXT: .LBB24_3: # %cond.load1 +; SKX_32-NEXT: vpextrd $1, %xmm0, %eax +; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind @@ -1356,40 +2068,48 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test24: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_64-NEXT: movw $3, %ax -; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 -; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 +; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %rdi, %xmm1 +; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 +; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; 
KNL_64-NEXT: vmovq %xmm0, %rax +; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx +; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test24: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: movw $3, %cx -; KNL_32-NEXT: kmovw %ecx, %k1 -; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 -; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: vmovd %xmm0, %eax +; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx +; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test24: ; SKX: # %bb.0: -; SKX-NEXT: movb $3, %al -; SKX-NEXT: kmovw %eax, %k1 -; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1} -; SKX-NEXT: vmovdqa %xmm1, %xmm0 +; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 +; SKX-NEXT: vpbroadcastq %rdi, %xmm1 +; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 +; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: vpextrq $1, %xmm0, %rcx +; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test24: ; SKX_32: # %bb.0: -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: movb $3, %cl -; SKX_32-NEXT: kmovw %ecx, %k1 -; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1} -; SKX_32-NEXT: vmovdqa %xmm1, %xmm0 +; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: vmovd %xmm0, %eax +; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx +; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind @@ -1400,27 +2120,62 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) { ; KNL_64-LABEL: test25: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm2 {%k1} +; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 +; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %rdi, %xmm1 +; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 +; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB26_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB26_3 +; KNL_64-NEXT: .LBB26_4: # %else2 +; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB26_1: # %cond.load +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2 +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB26_4 +; KNL_64-NEXT: .LBB26_3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test25: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: # kill: def $xmm0 
killed $xmm0 def $ymm0 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm2 {%k1} +; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 +; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB26_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB26_3 +; KNL_32-NEXT: .LBB26_4: # %else2 +; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB26_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm1 +; KNL_32-NEXT: vpinsrd $1, 4(%ecx), %xmm1, %xmm2 +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB26_4 +; KNL_32-NEXT: .LBB26_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %eax +; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm0 +; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm2 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl @@ -1428,17 +2183,56 @@ ; SKX-LABEL: test25: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX-NEXT: vpmovq2m %xmm1, %k1 -; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm2 {%k1} +; SKX-NEXT: vpmovq2m %xmm1, %k0 +; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 +; SKX-NEXT: vpbroadcastq %rdi, %xmm1 +; SKX-NEXT: vpsllq $3, %xmm0, %xmm0 +; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb $1, %al +; SKX-NEXT: jne .LBB26_1 +; SKX-NEXT: # %bb.2: # %else +; SKX-NEXT: testb $2, %al +; SKX-NEXT: jne .LBB26_3 +; SKX-NEXT: .LBB26_4: # %else2 +; SKX-NEXT: vmovdqa %xmm2, %xmm0 +; SKX-NEXT: retq +; SKX-NEXT: .LBB26_1: # %cond.load +; SKX-NEXT: vmovq %xmm0, %rcx +; SKX-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2 +; SKX-NEXT: testb $2, %al +; SKX-NEXT: je .LBB26_4 +; SKX-NEXT: .LBB26_3: # %cond.load1 +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2 ; SKX-NEXT: vmovdqa %xmm2, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test25: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 -; SKX_32-NEXT: vpmovq2m %xmm1, %k1 -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm2 {%k1} +; SKX_32-NEXT: vpmovq2m %xmm1, %k0 +; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: jne .LBB26_1 +; SKX_32-NEXT: # %bb.2: # %else +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: jne .LBB26_3 +; SKX_32-NEXT: .LBB26_4: # %else2 +; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 +; SKX_32-NEXT: retl +; SKX_32-NEXT: .LBB26_1: # %cond.load +; SKX_32-NEXT: vmovd %xmm0, %ecx +; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm1 +; SKX_32-NEXT: vpinsrd $1, 4(%ecx), %xmm1, %xmm2 +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: je .LBB26_4 +; SKX_32-NEXT: .LBB26_3: # %cond.load1 +; SKX_32-NEXT: vpextrd $1, %xmm0, %eax +; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm0 +; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm2 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> @@ -1450,40 +2244,52 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { ; KNL_64-LABEL: test26: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def 
$ymm0 -; KNL_64-NEXT: movb $3, %al -; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm1 {%k1} -; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 -; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 +; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %rdi, %xmm2 +; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 +; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; KNL_64-NEXT: vmovq %xmm0, %rax +; KNL_64-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test26: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: movb $3, %cl -; KNL_32-NEXT: kmovw %ecx, %k1 -; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm1 {%k1} -; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 -; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 +; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; KNL_32-NEXT: vmovd %xmm0, %eax +; KNL_32-NEXT: vpinsrd $0, (%eax), %xmm1, %xmm1 +; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm1, %xmm1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %eax +; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm0 +; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test26: ; SKX: # %bb.0: -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm1 {%k1} -; SKX-NEXT: vmovdqa %xmm1, %xmm0 +; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 +; SKX-NEXT: vpbroadcastq %rdi, %xmm2 +; SKX-NEXT: vpsllq $3, %xmm0, %xmm0 +; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; SKX-NEXT: vmovq %xmm0, %rax +; SKX-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1 +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test26: ; SKX_32: # %bb.0: -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: kxnorw %k0, %k0, %k1 -; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm1 {%k1} -; SKX_32-NEXT: vmovdqa %xmm1, %xmm0 +; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: vmovd %xmm0, %eax +; SKX_32-NEXT: vpinsrd $0, (%eax), %xmm1, %xmm1 +; SKX_32-NEXT: vpinsrd $1, 4(%eax), %xmm1, %xmm1 +; SKX_32-NEXT: vpextrd $1, %xmm0, %eax +; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm0 +; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0 ; SKX_32-NEXT: retl %sext_ind = sext <2 x i32> %ind to <2 x i64> %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind @@ -1495,40 +2301,48 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) { ; KNL_64-LABEL: test27: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_64-NEXT: movw $3, %ax -; KNL_64-NEXT: kmovw %eax, %k1 -; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vmovaps %xmm1, %xmm0 -; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 +; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %rdi, %xmm1 +; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 +; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: vmovq %xmm0, %rax +; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx +; KNL_64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test27: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: movw $3, %cx -; 
KNL_32-NEXT: kmovw %ecx, %k1
-; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
-; KNL_32-NEXT: vmovaps %xmm1, %xmm0
-; KNL_32-NEXT: vzeroupper
+; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
+; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
+; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; KNL_32-NEXT: vmovd %xmm0, %eax
+; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
+; KNL_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; KNL_32-NEXT: retl
;
; SKX-LABEL: test27:
; SKX: # %bb.0:
-; SKX-NEXT: movb $3, %al
-; SKX-NEXT: kmovw %eax, %k1
-; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
+; SKX-NEXT: vpbroadcastq %rdi, %xmm1
+; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
+; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; SKX-NEXT: vmovq %xmm0, %rax
+; SKX-NEXT: vpextrq $1, %xmm0, %rcx
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SKX-NEXT: retq
;
; SKX_32-LABEL: test27:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: movb $3, %cl
-; SKX_32-NEXT: kmovw %ecx, %k1
-; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vmovaps %xmm1, %xmm0
+; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
+; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
+; SKX_32-NEXT: vmovd %xmm0, %eax
+; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
+; SKX_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SKX_32-NEXT: retl
 %sext_ind = sext <2 x i32> %ind to <2 x i64>
 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
@@ -1540,35 +2354,34 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
; KNL_64-LABEL: test28:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: kmovw %eax, %k1
-; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
-; KNL_64-NEXT: vzeroupper
+; KNL_64-NEXT: vmovq %xmm1, %rax
+; KNL_64-NEXT: vmovss %xmm0, (%rax)
+; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
+; KNL_64-NEXT: vextractps $1, %xmm0, (%rax)
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test28:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL_32-NEXT: movw $3, %ax
-; KNL_32-NEXT: kmovw %eax, %k1
-; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
-; KNL_32-NEXT: vzeroupper
+; KNL_32-NEXT: vmovd %xmm1, %eax
+; KNL_32-NEXT: vmovss %xmm0, (%eax)
+; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
+; KNL_32-NEXT: vextractps $1, %xmm0, (%eax)
; KNL_32-NEXT: retl
;
; SKX-LABEL: test28:
; SKX: # %bb.0:
-; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
+; SKX-NEXT: vmovq %xmm1, %rax
+; SKX-NEXT: vmovss %xmm0, (%rax)
+; SKX-NEXT: vpextrq $1, %xmm1, %rax
+; SKX-NEXT: vextractps $1, %xmm0, (%rax)
; SKX-NEXT: retq
;
; SKX_32-LABEL: test28:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: movb $3, %al
-; SKX_32-NEXT: kmovw %eax, %k1
-; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
+; SKX_32-NEXT: vmovd %xmm1, %eax
+; SKX_32-NEXT: vmovss %xmm0, (%eax)
+; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
+; SKX_32-NEXT: vextractps $1, %xmm0, (%eax)
; SKX_32-NEXT: retl
 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
 ret void
@@ -2385,15 +3198,119 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> 
%k2,<4 x i64> %d) { ; KNL_64-LABEL: test_pr28312: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL_64-NEXT: kshiftlw $12, %k0, %k0 -; KNL_64-NEXT: kshiftrw $12, %k0, %k1 -; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1} -; KNL_64-NEXT: vpaddq %ymm1, %ymm1, %ymm0 -; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: # implicit-def: $ymm1 +; KNL_64-NEXT: je .LBB42_2 +; KNL_64-NEXT: # %bb.1: # %cond.load +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; KNL_64-NEXT: .LBB42_2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB42_4 +; KNL_64-NEXT: # %bb.3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx +; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm1, %xmm2 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; KNL_64-NEXT: .LBB42_4: # %else2 +; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm2 +; KNL_64-NEXT: je .LBB42_6 +; KNL_64-NEXT: # %bb.5: # %cond.load4 +; KNL_64-NEXT: vmovq %xmm2, %rcx +; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm3 +; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm3, %xmm3 +; KNL_64-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; KNL_64-NEXT: .LBB42_6: # %else5 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: je .LBB42_8 +; KNL_64-NEXT: # %bb.7: # %cond.load7 +; KNL_64-NEXT: vpextrq $1, %xmm2, %rax +; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm3 +; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm3, %xmm3 +; KNL_64-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; KNL_64-NEXT: .LBB42_8: # %else8 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: # implicit-def: $ymm3 +; KNL_64-NEXT: jne .LBB42_9 +; KNL_64-NEXT: # %bb.10: # %else15 +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB42_11 +; KNL_64-NEXT: .LBB42_12: # %else21 +; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: jne .LBB42_13 +; KNL_64-NEXT: .LBB42_14: # %else27 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: je .LBB42_16 +; KNL_64-NEXT: .LBB42_15: # %cond.load29 +; KNL_64-NEXT: vpextrq $1, %xmm2, %rax +; KNL_64-NEXT: vextracti128 $1, %ymm3, %xmm4 +; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm4, %xmm4 +; KNL_64-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; KNL_64-NEXT: .LBB42_16: # %else33 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: # implicit-def: $ymm4 +; KNL_64-NEXT: jne .LBB42_17 +; KNL_64-NEXT: # %bb.18: # %else40 +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB42_19 +; KNL_64-NEXT: .LBB42_20: # %else46 +; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: jne .LBB42_21 +; KNL_64-NEXT: .LBB42_22: # %else52 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: je .LBB42_24 +; KNL_64-NEXT: .LBB42_23: # %cond.load54 +; KNL_64-NEXT: vpextrq $1, %xmm2, %rax +; KNL_64-NEXT: vextracti128 $1, %ymm4, %xmm0 +; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0 +; KNL_64-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm4 +; KNL_64-NEXT: .LBB42_24: # %else58 +; KNL_64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 +; KNL_64-NEXT: vpaddq %ymm4, %ymm0, %ymm0 ; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB42_9: # %cond.load11 +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB42_12 +; KNL_64-NEXT: .LBB42_11: # %cond.load17 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx +; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm3, %xmm4 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; KNL_64-NEXT: testb 
$4, %al +; KNL_64-NEXT: je .LBB42_14 +; KNL_64-NEXT: .LBB42_13: # %cond.load23 +; KNL_64-NEXT: vmovq %xmm2, %rcx +; KNL_64-NEXT: vextracti128 $1, %ymm3, %xmm4 +; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm4, %xmm4 +; KNL_64-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: jne .LBB42_15 +; KNL_64-NEXT: jmp .LBB42_16 +; KNL_64-NEXT: .LBB42_17: # %cond.load36 +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB42_20 +; KNL_64-NEXT: .LBB42_19: # %cond.load42 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx +; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm4, %xmm0 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: je .LBB42_22 +; KNL_64-NEXT: .LBB42_21: # %cond.load48 +; KNL_64-NEXT: vmovq %xmm2, %rcx +; KNL_64-NEXT: vextracti128 $1, %ymm4, %xmm0 +; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm0, %xmm0 +; KNL_64-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm4 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: jne .LBB42_23 +; KNL_64-NEXT: jmp .LBB42_24 ; ; KNL_32-LABEL: test_pr28312: ; KNL_32: # %bb.0: @@ -2402,20 +3319,131 @@ ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp ; KNL_32-NEXT: .cfi_def_cfa_register %ebp +; KNL_32-NEXT: pushl %ebx +; KNL_32-NEXT: pushl %esi ; KNL_32-NEXT: andl $-32, %esp ; KNL_32-NEXT: subl $32, %esp -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; KNL_32-NEXT: .cfi_offset %esi, -16 +; KNL_32-NEXT: .cfi_offset %ebx, -12 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL_32-NEXT: kshiftlw $12, %k0, %k0 -; KNL_32-NEXT: kshiftrw $12, %k0, %k1 -; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k1} -; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0 +; KNL_32-NEXT: kmovw %k0, %ebx +; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: vmovd %xmm0, %eax +; KNL_32-NEXT: # implicit-def: $ymm1 +; KNL_32-NEXT: je .LBB42_2 +; KNL_32-NEXT: # %bb.1: # %cond.load +; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; KNL_32-NEXT: .LBB42_2: # %else +; KNL_32-NEXT: testb $2, %bl +; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx +; KNL_32-NEXT: je .LBB42_4 +; KNL_32-NEXT: # %bb.3: # %cond.load1 +; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm1, %xmm2 +; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; KNL_32-NEXT: .LBB42_4: # %else2 +; KNL_32-NEXT: testb $4, %bl +; KNL_32-NEXT: vpextrd $2, %xmm0, %edx +; KNL_32-NEXT: je .LBB42_6 +; KNL_32-NEXT: # %bb.5: # %cond.load4 +; KNL_32-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm2, %xmm2 +; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm2, %xmm2 +; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL_32-NEXT: .LBB42_6: # %else5 +; KNL_32-NEXT: testb $8, %bl +; KNL_32-NEXT: vpextrd $3, %xmm0, %esi +; KNL_32-NEXT: je .LBB42_8 +; KNL_32-NEXT: # %bb.7: # %cond.load7 +; KNL_32-NEXT: vextracti128 $1, %ymm1, %xmm0 +; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0 +; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm0, %xmm0 +; KNL_32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; KNL_32-NEXT: .LBB42_8: # %else8 +; KNL_32-NEXT: kmovw %k0, %ebx +; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: # implicit-def: $ymm0 +; KNL_32-NEXT: jne .LBB42_9 +; KNL_32-NEXT: # %bb.10: # %else15 +; KNL_32-NEXT: testb $2, %bl +; KNL_32-NEXT: jne .LBB42_11 +; KNL_32-NEXT: .LBB42_12: # %else21 +; KNL_32-NEXT: testb $4, %bl +; KNL_32-NEXT: jne .LBB42_13 +; KNL_32-NEXT: .LBB42_14: # %else27 +; KNL_32-NEXT: testb $8, %bl +; 
KNL_32-NEXT: je .LBB42_16 +; KNL_32-NEXT: .LBB42_15: # %cond.load29 +; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm2 +; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm2, %xmm2 +; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm2, %xmm2 +; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; KNL_32-NEXT: .LBB42_16: # %else33 +; KNL_32-NEXT: kmovw %k0, %ebx +; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: # implicit-def: $ymm2 +; KNL_32-NEXT: jne .LBB42_17 +; KNL_32-NEXT: # %bb.18: # %else40 +; KNL_32-NEXT: testb $2, %bl +; KNL_32-NEXT: jne .LBB42_19 +; KNL_32-NEXT: .LBB42_20: # %else46 +; KNL_32-NEXT: testb $4, %bl +; KNL_32-NEXT: jne .LBB42_21 +; KNL_32-NEXT: .LBB42_22: # %else52 +; KNL_32-NEXT: testb $8, %bl +; KNL_32-NEXT: je .LBB42_24 +; KNL_32-NEXT: .LBB42_23: # %cond.load54 +; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm3 +; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm3, %xmm3 +; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm3, %xmm3 +; KNL_32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; KNL_32-NEXT: .LBB42_24: # %else58 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 -; KNL_32-NEXT: movl %ebp, %esp +; KNL_32-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; KNL_32-NEXT: leal -8(%ebp), %esp +; KNL_32-NEXT: popl %esi +; KNL_32-NEXT: popl %ebx ; KNL_32-NEXT: popl %ebp ; KNL_32-NEXT: .cfi_def_cfa %esp, 4 ; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB42_9: # %cond.load11 +; KNL_32-NEXT: .cfi_def_cfa %ebp, 8 +; KNL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; KNL_32-NEXT: testb $2, %bl +; KNL_32-NEXT: je .LBB42_12 +; KNL_32-NEXT: .LBB42_11: # %cond.load17 +; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm2 +; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; KNL_32-NEXT: testb $4, %bl +; KNL_32-NEXT: je .LBB42_14 +; KNL_32-NEXT: .LBB42_13: # %cond.load23 +; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm2 +; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm2, %xmm2 +; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm2, %xmm2 +; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; KNL_32-NEXT: testb $8, %bl +; KNL_32-NEXT: jne .LBB42_15 +; KNL_32-NEXT: jmp .LBB42_16 +; KNL_32-NEXT: .LBB42_17: # %cond.load36 +; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; KNL_32-NEXT: testb $2, %bl +; KNL_32-NEXT: je .LBB42_20 +; KNL_32-NEXT: .LBB42_19: # %cond.load42 +; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm2, %xmm3 +; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm3, %xmm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; KNL_32-NEXT: testb $4, %bl +; KNL_32-NEXT: je .LBB42_22 +; KNL_32-NEXT: .LBB42_21: # %cond.load48 +; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm3 +; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm3, %xmm3 +; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm3, %xmm3 +; KNL_32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; KNL_32-NEXT: testb $8, %bl +; KNL_32-NEXT: jne .LBB42_23 +; KNL_32-NEXT: jmp .LBB42_24 ; ; SKX-LABEL: test_pr28312: ; SKX: # %bb.0: @@ -2612,32 +3640,66 @@ define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <2 x float> %src0) { ; KNL_64-LABEL: large_index: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 ; KNL_64-NEXT: vmovq %rcx, %xmm0 ; KNL_64-NEXT: vmovq %rsi, %xmm2 ; KNL_64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k1} +; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 +; KNL_64-NEXT: vmovq %rdi, %xmm2 +; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 +; 
KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB47_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB47_3 +; KNL_64-NEXT: .LBB47_4: # %else2 +; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB47_1: # %cond.load +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; KNL_64-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB47_4 +; KNL_64-NEXT: .LBB47_3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; KNL_64-NEXT: vmovaps %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: large_index: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; KNL_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; KNL_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm1 {%k1} +; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 +; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB47_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB47_3 +; KNL_32-NEXT: .LBB47_4: # %else2 +; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB47_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; KNL_32-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB47_4 +; KNL_32-NEXT: .LBB47_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %eax +; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; KNL_32-NEXT: vmovaps %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl @@ -2645,24 +3707,60 @@ ; SKX-LABEL: large_index: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vpmovq2m %xmm0, %k1 +; SKX-NEXT: vpmovq2m %xmm0, %k0 ; SKX-NEXT: vmovq %rcx, %xmm0 ; SKX-NEXT: vmovq %rsi, %xmm2 ; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm1 {%k1} +; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 +; SKX-NEXT: vpbroadcastq %rdi, %xmm2 +; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb $1, %al +; SKX-NEXT: jne .LBB47_1 +; SKX-NEXT: # %bb.2: # %else +; SKX-NEXT: testb $2, %al +; SKX-NEXT: jne .LBB47_3 +; SKX-NEXT: .LBB47_4: # %else2 +; SKX-NEXT: vmovdqa %xmm1, %xmm0 +; SKX-NEXT: retq +; SKX-NEXT: .LBB47_1: # %cond.load +; SKX-NEXT: vmovq %xmm0, %rcx +; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SKX-NEXT: testb $2, %al +; SKX-NEXT: je .LBB47_4 +; SKX-NEXT: .LBB47_3: # %cond.load1 +; SKX-NEXT: vpextrq $1, %xmm0, %rax +; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; SKX-NEXT: vmovaps %xmm1, %xmm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: large_index: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm0, 
%xmm0 -; SKX_32-NEXT: vpmovq2m %xmm0, %k1 -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vpmovq2m %xmm0, %k0 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; SKX_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; SKX_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm1 {%k1} +; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: jne .LBB47_1 +; SKX_32-NEXT: # %bb.2: # %else +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: jne .LBB47_3 +; SKX_32-NEXT: .LBB47_4: # %else2 +; SKX_32-NEXT: vmovaps %xmm1, %xmm0 +; SKX_32-NEXT: retl +; SKX_32-NEXT: .LBB47_1: # %cond.load +; SKX_32-NEXT: vmovd %xmm0, %ecx +; SKX_32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SKX_32-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: je .LBB47_4 +; SKX_32-NEXT: .LBB47_3: # %cond.load1 +; SKX_32-NEXT: vpextrd $1, %xmm0, %eax +; SKX_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] ; SKX_32-NEXT: vmovaps %xmm1, %xmm0 ; SKX_32-NEXT: retl %gep.random = getelementptr float, float* %base, <2 x i128> %ind @@ -2839,42 +3937,108 @@ define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) { ; KNL_64-LABEL: test_scatter_2i32_index: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1} +; KNL_64-NEXT: vpmovsxdq %xmm1, %xmm1 +; KNL_64-NEXT: vpsllq $3, %xmm1, %xmm1 +; KNL_64-NEXT: vmovq %rdi, %xmm2 +; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 +; KNL_64-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB52_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB52_3 +; KNL_64-NEXT: .LBB52_4: # %else2 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB52_1: # %cond.store +; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vmovlps %xmm0, (%rcx) +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB52_4 +; KNL_64-NEXT: .LBB52_3: # %cond.store1 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rax +; KNL_64-NEXT: vmovhps %xmm0, (%rax) ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test_scatter_2i32_index: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vscatterdpd %zmm0, (%eax,%ymm1,8) {%k1} +; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1 +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 +; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB52_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB52_3 +; KNL_32-NEXT: .LBB52_4: # %else2 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB52_1: # %cond.store +; KNL_32-NEXT: vmovd %xmm1, %ecx +; KNL_32-NEXT: 
vmovlps %xmm0, (%ecx) +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB52_4 +; KNL_32-NEXT: .LBB52_3: # %cond.store1 +; KNL_32-NEXT: vpextrd $1, %xmm1, %eax +; KNL_32-NEXT: vmovhps %xmm0, (%eax) ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX-LABEL: test_scatter_2i32_index: ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX-NEXT: vpmovq2m %xmm2, %k1 -; SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1} +; SKX-NEXT: vpmovq2m %xmm2, %k0 +; SKX-NEXT: vpbroadcastq %rdi, %xmm2 +; SKX-NEXT: vpmovsxdq %xmm1, %xmm1 +; SKX-NEXT: vpsllq $3, %xmm1, %xmm1 +; SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; SKX-NEXT: kmovw %k0, %eax +; SKX-NEXT: testb $1, %al +; SKX-NEXT: jne .LBB52_1 +; SKX-NEXT: # %bb.2: # %else +; SKX-NEXT: testb $2, %al +; SKX-NEXT: jne .LBB52_3 +; SKX-NEXT: .LBB52_4: # %else2 +; SKX-NEXT: retq +; SKX-NEXT: .LBB52_1: # %cond.store +; SKX-NEXT: vmovq %xmm1, %rcx +; SKX-NEXT: vmovlps %xmm0, (%rcx) +; SKX-NEXT: testb $2, %al +; SKX-NEXT: je .LBB52_4 +; SKX-NEXT: .LBB52_3: # %cond.store1 +; SKX-NEXT: vpextrq $1, %xmm1, %rax +; SKX-NEXT: vmovhps %xmm0, (%rax) ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_scatter_2i32_index: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 -; SKX_32-NEXT: vpmovq2m %xmm2, %k1 -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vscatterdpd %xmm0, (%eax,%xmm1,8) {%k1} +; SKX_32-NEXT: vpmovq2m %xmm2, %k0 +; SKX_32-NEXT: vpslld $3, %xmm1, %xmm1 +; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm1, %xmm1 +; SKX_32-NEXT: kmovw %k0, %eax +; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: jne .LBB52_1 +; SKX_32-NEXT: # %bb.2: # %else +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: jne .LBB52_3 +; SKX_32-NEXT: .LBB52_4: # %else2 +; SKX_32-NEXT: retl +; SKX_32-NEXT: .LBB52_1: # %cond.store +; SKX_32-NEXT: vmovd %xmm1, %ecx +; SKX_32-NEXT: vmovlps %xmm0, (%ecx) +; SKX_32-NEXT: testb $2, %al +; SKX_32-NEXT: je .LBB52_4 +; SKX_32-NEXT: .LBB52_3: # %cond.store1 +; SKX_32-NEXT: vpextrd $1, %xmm1, %eax +; SKX_32-NEXT: vmovhps %xmm0, (%eax) ; SKX_32-NEXT: retl %gep = getelementptr double, double *%base, <2 x i32> %ind call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask) @@ -3148,12 +4312,27 @@ ; KNL_64: # %bb.0: ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL_64-NEXT: kshiftlw $14, %k0, %k0 -; KNL_64-NEXT: kshiftrw $14, %k0, %k1 -; KNL_64-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4294967294,u,u,u,u,u,u> +; KNL_64-NEXT: vmovq %rdi, %xmm0 +; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0 +; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; KNL_64-NEXT: kmovw %k0, %eax ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_64-NEXT: vpgatherdq (%rdi,%ymm1,8), %zmm0 {%k1} -; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: jne .LBB58_1 +; KNL_64-NEXT: # %bb.2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: jne .LBB58_3 +; KNL_64-NEXT: .LBB58_4: # %else2 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB58_1: # %cond.load +; KNL_64-NEXT: vmovq %xmm1, %rcx +; KNL_64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB58_4 +; KNL_64-NEXT: .LBB58_3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm1, %rax +; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; @@ -3161,43 +4340,105 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL_32-NEXT: kshiftlw $14, %k0, %k0 -; 
KNL_32-NEXT: kshiftrw $14, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4294967294,u,u,u,u,u,u> +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 +; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; KNL_32-NEXT: kmovw %k0, %eax ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_32-NEXT: vpgatherdq (%eax,%ymm1,8), %zmm0 {%k1} -; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB58_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB58_3 +; KNL_32-NEXT: .LBB58_4: # %else2 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB58_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm1, %ecx +; KNL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB58_4 +; KNL_32-NEXT: .LBB58_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm1, %eax +; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 +; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; ; SKX_SMALL-LABEL: gather_2i64_constant_indices: ; SKX_SMALL: # %bb.0: ; SKX_SMALL-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k1 -; SKX_SMALL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4294967294,u,u> +; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0 +; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0 +; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; SKX_SMALL-NEXT: kmovw %k0, %eax ; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; SKX_SMALL-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1} +; SKX_SMALL-NEXT: testb $1, %al +; SKX_SMALL-NEXT: jne .LBB58_1 +; SKX_SMALL-NEXT: # %bb.2: # %else +; SKX_SMALL-NEXT: testb $2, %al +; SKX_SMALL-NEXT: jne .LBB58_3 +; SKX_SMALL-NEXT: .LBB58_4: # %else2 +; SKX_SMALL-NEXT: retq +; SKX_SMALL-NEXT: .LBB58_1: # %cond.load +; SKX_SMALL-NEXT: vmovq %xmm1, %rcx +; SKX_SMALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; SKX_SMALL-NEXT: testb $2, %al +; SKX_SMALL-NEXT: je .LBB58_4 +; SKX_SMALL-NEXT: .LBB58_3: # %cond.load1 +; SKX_SMALL-NEXT: vpextrq $1, %xmm1, %rax +; SKX_SMALL-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0 ; SKX_SMALL-NEXT: retq ; ; SKX_LARGE-LABEL: gather_2i64_constant_indices: ; SKX_LARGE: # %bb.0: ; SKX_LARGE-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX_LARGE-NEXT: vpmovq2m %xmm0, %k1 +; SKX_LARGE-NEXT: vpmovq2m %xmm0, %k0 +; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; SKX_LARGE-NEXT: vmovdqa (%rax), %xmm1 +; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm1 +; SKX_LARGE-NEXT: kmovw %k0, %eax ; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; SKX_LARGE-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1} +; SKX_LARGE-NEXT: testb $1, %al +; SKX_LARGE-NEXT: jne .LBB58_1 +; SKX_LARGE-NEXT: # %bb.2: # %else +; SKX_LARGE-NEXT: testb $2, %al +; SKX_LARGE-NEXT: jne .LBB58_3 +; SKX_LARGE-NEXT: .LBB58_4: # %else2 +; SKX_LARGE-NEXT: retq +; SKX_LARGE-NEXT: .LBB58_1: # %cond.load +; SKX_LARGE-NEXT: vmovq %xmm1, %rcx +; SKX_LARGE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; SKX_LARGE-NEXT: testb $2, %al +; SKX_LARGE-NEXT: je .LBB58_4 +; SKX_LARGE-NEXT: .LBB58_3: # %cond.load1 +; SKX_LARGE-NEXT: vpextrq $1, %xmm1, %rax +; SKX_LARGE-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0 ; SKX_LARGE-NEXT: retq ; ; SKX_32-LABEL: gather_2i64_constant_indices: ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX_32-NEXT: vpmovq2m %xmm0, %k1 -; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4294967294,u,u> +; SKX_32-NEXT: 
vpmovq2m %xmm0, %k0
+; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
+; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; SKX_32-NEXT: kmovw %k0, %eax
; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; SKX_32-NEXT: vpgatherdq (%eax,%xmm1,8), %xmm0 {%k1}
+; SKX_32-NEXT: testb $1, %al
+; SKX_32-NEXT: jne .LBB58_1
+; SKX_32-NEXT: # %bb.2: # %else
+; SKX_32-NEXT: testb $2, %al
+; SKX_32-NEXT: jne .LBB58_3
+; SKX_32-NEXT: .LBB58_4: # %else2
+; SKX_32-NEXT: retl
+; SKX_32-NEXT: .LBB58_1: # %cond.load
+; SKX_32-NEXT: vmovd %xmm1, %ecx
+; SKX_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; SKX_32-NEXT: testb $2, %al
+; SKX_32-NEXT: je .LBB58_4
+; SKX_32-NEXT: .LBB58_3: # %cond.load1
+; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
+; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
+; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
; SKX_32-NEXT: retl
 %gep = getelementptr i64, i64* %ptr, <2 x i64> <i64 0, i64 -2>
 %res = tail call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep, i32 8, <2 x i1> %mask, <2 x i64> zeroinitializer) #1
@@ -3265,53 +4506,128 @@ define void @scatter_2i64_constant_indices(i32* %ptr, <2 x i1> %mask, <2 x i32> %src0) {
; KNL_64-LABEL: scatter_2i64_constant_indices:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
-; KNL_64-NEXT: kshiftlw $14, %k0, %k0
-; KNL_64-NEXT: kshiftrw $14, %k0, %k1
-; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,4294967294,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; KNL_64-NEXT: vmovq %rdi, %xmm0
+; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0
+; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; KNL_64-NEXT: kmovw %k0, %eax
+; KNL_64-NEXT: testb $1, %al
+; KNL_64-NEXT: jne .LBB60_1
+; KNL_64-NEXT: # %bb.2: # %else
+; KNL_64-NEXT: testb $2, %al
+; KNL_64-NEXT: jne .LBB60_3
+; KNL_64-NEXT: .LBB60_4: # %else2
+; KNL_64-NEXT: vzeroupper
+; KNL_64-NEXT: retq
+; KNL_64-NEXT: .LBB60_1: # %cond.store
+; KNL_64-NEXT: vmovq %xmm0, %rcx
+; KNL_64-NEXT: vmovss %xmm1, (%rcx)
+; KNL_64-NEXT: testb $2, %al
+; KNL_64-NEXT: je .LBB60_4
+; KNL_64-NEXT: .LBB60_3: # %cond.store1
+; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
+; KNL_64-NEXT: vextractps $1, %xmm1, (%rax)
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: scatter_2i64_constant_indices:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
-; KNL_32-NEXT: kshiftlw $14, %k0, %k0
-; KNL_32-NEXT: kshiftrw $14, %k0, %k1
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,4294967294,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
+; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
+; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; KNL_32-NEXT: kmovw %k0, %eax
+; KNL_32-NEXT: testb $1, %al
+; KNL_32-NEXT: jne .LBB60_1
+; KNL_32-NEXT: # %bb.2: # %else
+; KNL_32-NEXT: testb $2, %al
+; KNL_32-NEXT: jne .LBB60_3
+; KNL_32-NEXT: .LBB60_4: # %else2
+; KNL_32-NEXT: vzeroupper
+; KNL_32-NEXT: retl
+; KNL_32-NEXT: .LBB60_1: # %cond.store
+; KNL_32-NEXT: vmovd %xmm0, %ecx
+; KNL_32-NEXT: vmovss %xmm1, (%ecx)
+; KNL_32-NEXT: testb $2, %al
+; KNL_32-NEXT: je .LBB60_4
+; KNL_32-NEXT: .LBB60_3: # %cond.store1
+; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
+; KNL_32-NEXT: vextractps $1, %xmm1, (%eax)
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX_SMALL-LABEL: scatter_2i64_constant_indices:
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k1
-; SKX_SMALL-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4294967294,u,u>
-; SKX_SMALL-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
+; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0
+; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0
+; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; SKX_SMALL-NEXT: kmovw %k0, %eax
+; SKX_SMALL-NEXT: testb $1, %al
+; SKX_SMALL-NEXT: jne .LBB60_1
+; SKX_SMALL-NEXT: # %bb.2: # %else
+; SKX_SMALL-NEXT: testb $2, %al
+; SKX_SMALL-NEXT: jne .LBB60_3
+; SKX_SMALL-NEXT: .LBB60_4: # %else2
+; SKX_SMALL-NEXT: retq
+; SKX_SMALL-NEXT: .LBB60_1: # %cond.store
+; SKX_SMALL-NEXT: vmovq %xmm0, %rcx
+; SKX_SMALL-NEXT: vmovss %xmm1, (%rcx)
+; SKX_SMALL-NEXT: testb $2, %al
+; SKX_SMALL-NEXT: je .LBB60_4
+; SKX_SMALL-NEXT: .LBB60_3: # %cond.store1
+; SKX_SMALL-NEXT: vpextrq $1, %xmm0, %rax
+; SKX_SMALL-NEXT: vextractps $1, %xmm1, (%rax)
; SKX_SMALL-NEXT: retq
;
; SKX_LARGE-LABEL: scatter_2i64_constant_indices:
; SKX_LARGE: # %bb.0:
; SKX_LARGE-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX_LARGE-NEXT: vpmovq2m %xmm0, %k1
+; SKX_LARGE-NEXT: vpmovq2m %xmm0, %k0
+; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; SKX_LARGE-NEXT: vmovdqa (%rax), %xmm0
-; SKX_LARGE-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
+; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm0
+; SKX_LARGE-NEXT: kmovw %k0, %eax
+; SKX_LARGE-NEXT: testb $1, %al
+; SKX_LARGE-NEXT: jne .LBB60_1
+; SKX_LARGE-NEXT: # %bb.2: # %else
+; SKX_LARGE-NEXT: testb $2, %al
+; SKX_LARGE-NEXT: jne .LBB60_3
+; SKX_LARGE-NEXT: .LBB60_4: # %else2
+; SKX_LARGE-NEXT: retq
+; SKX_LARGE-NEXT: .LBB60_1: # %cond.store
+; SKX_LARGE-NEXT: vmovq %xmm0, %rcx
+; SKX_LARGE-NEXT: vmovss %xmm1, (%rcx)
+; SKX_LARGE-NEXT: testb $2, %al
+; SKX_LARGE-NEXT: je .LBB60_4
+; SKX_LARGE-NEXT: .LBB60_3: # %cond.store1
+; SKX_LARGE-NEXT: vpextrq $1, %xmm0, %rax
+; SKX_LARGE-NEXT: vextractps $1, %xmm1, (%rax)
; SKX_LARGE-NEXT: retq
;
; SKX_32-LABEL: scatter_2i64_constant_indices:
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX_32-NEXT: vpmovq2m %xmm0, %k1
-; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4294967294,u,u>
-; SKX_32-NEXT: vpscatterdd %xmm1, (%eax,%xmm0,4) {%k1}
+; SKX_32-NEXT: vpmovq2m %xmm0, %k0
+; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
+; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: testb $1, %al
+; SKX_32-NEXT: jne .LBB60_1
+; SKX_32-NEXT: # %bb.2: # %else
+; SKX_32-NEXT: testb $2, %al
+; SKX_32-NEXT: jne .LBB60_3
+; SKX_32-NEXT: .LBB60_4: # %else2
+; SKX_32-NEXT: retl
+; SKX_32-NEXT: .LBB60_1: # %cond.store
+; SKX_32-NEXT: vmovd %xmm0, %ecx
+; SKX_32-NEXT: vmovss %xmm1, (%ecx)
+; SKX_32-NEXT: testb $2, %al
+; SKX_32-NEXT: je .LBB60_4
+; SKX_32-NEXT: .LBB60_3: # %cond.store1
+; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
+; SKX_32-NEXT: vextractps $1, %xmm1, (%eax)
; SKX_32-NEXT: retl
 %gep = getelementptr i32, i32* %ptr, <2 x i64> <i64 0, i64 -2>
 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %src0, <2 x i32*> %gep, i32 4, <2 x i1> %mask)
@@ -3379,27 +4695,84 @@ define <4 x i32> @splat_ptr_gather(i32* %ptr, <4 x i1> %mask, <4 x i32> %passthru) {
; KNL_64-LABEL: splat_ptr_gather:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_64-NEXT: vpslld $31, %xmm0, %xmm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL_64-NEXT: kshiftlw $12, 
%k0, %k0 -; KNL_64-NEXT: kshiftrw $12, %k0, %k1 -; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} +; KNL_64-NEXT: vmovq %rdi, %xmm0 +; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: je .LBB62_2 +; KNL_64-NEXT: # %bb.1: # %cond.load +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm1 +; KNL_64-NEXT: .LBB62_2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB62_4 +; KNL_64-NEXT: # %bb.3: # %cond.load1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx +; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1 +; KNL_64-NEXT: .LBB62_4: # %else2 +; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL_64-NEXT: jne .LBB62_5 +; KNL_64-NEXT: # %bb.6: # %else5 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: jne .LBB62_7 +; KNL_64-NEXT: .LBB62_8: # %else8 +; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB62_5: # %cond.load4 +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm1 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: je .LBB62_8 +; KNL_64-NEXT: .LBB62_7: # %cond.load7 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vpinsrd $3, (%rax), %xmm1, %xmm1 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: splat_ptr_gather: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL_32-NEXT: kshiftlw $12, %k0, %k0 -; KNL_32-NEXT: kshiftrw $12, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB62_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB62_3 +; KNL_32-NEXT: .LBB62_4: # %else2 +; KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: jne .LBB62_5 +; KNL_32-NEXT: .LBB62_6: # %else5 +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: jne .LBB62_7 +; KNL_32-NEXT: .LBB62_8: # %else8 +; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB62_1: # %cond.load +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm1, %xmm1 +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB62_4 +; KNL_32-NEXT: .LBB62_3: # %cond.load1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx +; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm1, %xmm1 +; KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: je .LBB62_6 +; KNL_32-NEXT: .LBB62_5: # %cond.load4 +; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx +; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm1, %xmm1 +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: je .LBB62_8 +; KNL_32-NEXT: .LBB62_7: # %cond.load7 +; KNL_32-NEXT: vpextrd $3, %xmm0, %eax +; KNL_32-NEXT: vpinsrd $3, (%eax), %xmm1, %xmm1 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl @@ -3432,26 +4805,81 @@ define void @splat_ptr_scatter(i32* %ptr, <4 x i1> %mask, <4 x i32> %val) { ; KNL_64-LABEL: splat_ptr_scatter: ; KNL_64: # %bb.0: -; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL_64-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL_64-NEXT: kshiftlw $12, %k0, %k0 -; KNL_64-NEXT: kshiftrw $12, %k0, %k1 -; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_64-NEXT: 
vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} +; KNL_64-NEXT: vmovq %rdi, %xmm0 +; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL_64-NEXT: kmovw %k0, %eax +; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: je .LBB63_2 +; KNL_64-NEXT: # %bb.1: # %cond.store +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vmovss %xmm1, (%rcx) +; KNL_64-NEXT: .LBB63_2: # %else +; KNL_64-NEXT: testb $2, %al +; KNL_64-NEXT: je .LBB63_4 +; KNL_64-NEXT: # %bb.3: # %cond.store1 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx +; KNL_64-NEXT: vextractps $1, %xmm1, (%rcx) +; KNL_64-NEXT: .LBB63_4: # %else2 +; KNL_64-NEXT: testb $4, %al +; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL_64-NEXT: jne .LBB63_5 +; KNL_64-NEXT: # %bb.6: # %else4 +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: jne .LBB63_7 +; KNL_64-NEXT: .LBB63_8: # %else6 +; KNL_64-NEXT: vzeroupper +; KNL_64-NEXT: retq +; KNL_64-NEXT: .LBB63_5: # %cond.store3 +; KNL_64-NEXT: vmovq %xmm0, %rcx +; KNL_64-NEXT: vextractps $2, %xmm1, (%rcx) +; KNL_64-NEXT: testb $8, %al +; KNL_64-NEXT: je .LBB63_8 +; KNL_64-NEXT: .LBB63_7: # %cond.store5 +; KNL_64-NEXT: vpextrq $1, %xmm0, %rax +; KNL_64-NEXT: vextractps $3, %xmm1, (%rax) ; KNL_64-NEXT: vzeroupper ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: splat_ptr_scatter: ; KNL_32: # %bb.0: -; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL_32-NEXT: kshiftlw $12, %k0, %k0 -; KNL_32-NEXT: kshiftrw $12, %k0, %k1 -; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1} +; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: jne .LBB63_1 +; KNL_32-NEXT: # %bb.2: # %else +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: jne .LBB63_3 +; KNL_32-NEXT: .LBB63_4: # %else2 +; KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: jne .LBB63_5 +; KNL_32-NEXT: .LBB63_6: # %else4 +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: jne .LBB63_7 +; KNL_32-NEXT: .LBB63_8: # %else6 +; KNL_32-NEXT: vzeroupper +; KNL_32-NEXT: retl +; KNL_32-NEXT: .LBB63_1: # %cond.store +; KNL_32-NEXT: vmovd %xmm0, %ecx +; KNL_32-NEXT: vmovss %xmm1, (%ecx) +; KNL_32-NEXT: testb $2, %al +; KNL_32-NEXT: je .LBB63_4 +; KNL_32-NEXT: .LBB63_3: # %cond.store1 +; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx +; KNL_32-NEXT: vextractps $1, %xmm1, (%ecx) +; KNL_32-NEXT: testb $4, %al +; KNL_32-NEXT: je .LBB63_6 +; KNL_32-NEXT: .LBB63_5: # %cond.store3 +; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx +; KNL_32-NEXT: vextractps $2, %xmm1, (%ecx) +; KNL_32-NEXT: testb $8, %al +; KNL_32-NEXT: je .LBB63_8 +; KNL_32-NEXT: .LBB63_7: # %cond.store5 +; KNL_32-NEXT: vpextrd $3, %xmm0, %eax +; KNL_32-NEXT: vextractps $3, %xmm1, (%eax) ; KNL_32-NEXT: vzeroupper ; KNL_32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -7,21 +7,59 @@ ; WIDEN_SKX-LABEL: test_gather_v2i32_index: ; WIDEN_SKX: # %bb.0: ; WIDEN_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; WIDEN_SKX-NEXT: vpmovq2m %xmm1, %k1 -; WIDEN_SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1} -; WIDEN_SKX-NEXT: vmovapd %xmm2, %xmm0 +; WIDEN_SKX-NEXT: vpmovq2m %xmm1, %k0 +; WIDEN_SKX-NEXT: vpbroadcastq %rdi, %xmm1 +; WIDEN_SKX-NEXT: vpmovsxdq %xmm0, %xmm0 +; WIDEN_SKX-NEXT: vpsllq $3, %xmm0, %xmm0 +; WIDEN_SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; 
WIDEN_SKX-NEXT: kmovw %k0, %eax +; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: jne .LBB0_1 +; WIDEN_SKX-NEXT: # %bb.2: # %else +; WIDEN_SKX-NEXT: testb $2, %al +; WIDEN_SKX-NEXT: jne .LBB0_3 +; WIDEN_SKX-NEXT: .LBB0_4: # %else2 +; WIDEN_SKX-NEXT: vmovaps %xmm2, %xmm0 +; WIDEN_SKX-NEXT: retq +; WIDEN_SKX-NEXT: .LBB0_1: # %cond.load +; WIDEN_SKX-NEXT: vmovq %xmm0, %rcx +; WIDEN_SKX-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; WIDEN_SKX-NEXT: testb $2, %al +; WIDEN_SKX-NEXT: je .LBB0_4 +; WIDEN_SKX-NEXT: .LBB0_3: # %cond.load1 +; WIDEN_SKX-NEXT: vpextrq $1, %xmm0, %rax +; WIDEN_SKX-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; WIDEN_SKX-NEXT: vmovaps %xmm2, %xmm0 ; WIDEN_SKX-NEXT: retq ; ; WIDEN_KNL-LABEL: test_gather_v2i32_index: ; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; WIDEN_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; WIDEN_KNL-NEXT: kshiftlw $14, %k0, %k0 -; WIDEN_KNL-NEXT: kshiftrw $14, %k0, %k1 -; WIDEN_KNL-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} -; WIDEN_KNL-NEXT: vmovapd %xmm2, %xmm0 +; WIDEN_KNL-NEXT: vpmovsxdq %xmm0, %xmm0 +; WIDEN_KNL-NEXT: vpsllq $3, %xmm0, %xmm0 +; WIDEN_KNL-NEXT: vmovq %rdi, %xmm1 +; WIDEN_KNL-NEXT: vpbroadcastq %xmm1, %xmm1 +; WIDEN_KNL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_KNL-NEXT: kmovw %k0, %eax +; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: jne .LBB0_1 +; WIDEN_KNL-NEXT: # %bb.2: # %else +; WIDEN_KNL-NEXT: testb $2, %al +; WIDEN_KNL-NEXT: jne .LBB0_3 +; WIDEN_KNL-NEXT: .LBB0_4: # %else2 +; WIDEN_KNL-NEXT: vmovaps %xmm2, %xmm0 +; WIDEN_KNL-NEXT: vzeroupper +; WIDEN_KNL-NEXT: retq +; WIDEN_KNL-NEXT: .LBB0_1: # %cond.load +; WIDEN_KNL-NEXT: vmovq %xmm0, %rcx +; WIDEN_KNL-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; WIDEN_KNL-NEXT: testb $2, %al +; WIDEN_KNL-NEXT: je .LBB0_4 +; WIDEN_KNL-NEXT: .LBB0_3: # %cond.load1 +; WIDEN_KNL-NEXT: vpextrq $1, %xmm0, %rax +; WIDEN_KNL-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] +; WIDEN_KNL-NEXT: vmovaps %xmm2, %xmm0 ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; @@ -40,19 +78,55 @@ ; WIDEN_SKX-LABEL: test_scatter_v2i32_index: ; WIDEN_SKX: # %bb.0: ; WIDEN_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; WIDEN_SKX-NEXT: vpmovq2m %xmm2, %k1 -; WIDEN_SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1} +; WIDEN_SKX-NEXT: vpmovq2m %xmm2, %k0 +; WIDEN_SKX-NEXT: vpbroadcastq %rdi, %xmm2 +; WIDEN_SKX-NEXT: vpmovsxdq %xmm1, %xmm1 +; WIDEN_SKX-NEXT: vpsllq $3, %xmm1, %xmm1 +; WIDEN_SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_SKX-NEXT: kmovw %k0, %eax +; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: jne .LBB1_1 +; WIDEN_SKX-NEXT: # %bb.2: # %else +; WIDEN_SKX-NEXT: testb $2, %al +; WIDEN_SKX-NEXT: jne .LBB1_3 +; WIDEN_SKX-NEXT: .LBB1_4: # %else2 +; WIDEN_SKX-NEXT: retq +; WIDEN_SKX-NEXT: .LBB1_1: # %cond.store +; WIDEN_SKX-NEXT: vmovq %xmm1, %rcx +; WIDEN_SKX-NEXT: vmovlps %xmm0, (%rcx) +; WIDEN_SKX-NEXT: testb $2, %al +; WIDEN_SKX-NEXT: je .LBB1_4 +; WIDEN_SKX-NEXT: .LBB1_3: # %cond.store1 +; WIDEN_SKX-NEXT: vpextrq $1, %xmm1, %rax +; WIDEN_SKX-NEXT: vmovhps %xmm0, (%rax) ; WIDEN_SKX-NEXT: retq ; ; WIDEN_KNL-LABEL: test_scatter_v2i32_index: ; WIDEN_KNL: # %bb.0: -; WIDEN_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; WIDEN_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; WIDEN_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 -; WIDEN_KNL-NEXT: kshiftlw $14, %k0, %k0 -; WIDEN_KNL-NEXT: kshiftrw $14, %k0, %k1 -; 
WIDEN_KNL-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1}
+; WIDEN_KNL-NEXT: vpmovsxdq %xmm1, %xmm1
+; WIDEN_KNL-NEXT: vpsllq $3, %xmm1, %xmm1
+; WIDEN_KNL-NEXT: vmovq %rdi, %xmm2
+; WIDEN_KNL-NEXT: vpbroadcastq %xmm2, %xmm2
+; WIDEN_KNL-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; WIDEN_KNL-NEXT: kmovw %k0, %eax
+; WIDEN_KNL-NEXT: testb $1, %al
+; WIDEN_KNL-NEXT: jne .LBB1_1
+; WIDEN_KNL-NEXT: # %bb.2: # %else
+; WIDEN_KNL-NEXT: testb $2, %al
+; WIDEN_KNL-NEXT: jne .LBB1_3
+; WIDEN_KNL-NEXT: .LBB1_4: # %else2
+; WIDEN_KNL-NEXT: vzeroupper
+; WIDEN_KNL-NEXT: retq
+; WIDEN_KNL-NEXT: .LBB1_1: # %cond.store
+; WIDEN_KNL-NEXT: vmovq %xmm1, %rcx
+; WIDEN_KNL-NEXT: vmovlps %xmm0, (%rcx)
+; WIDEN_KNL-NEXT: testb $2, %al
+; WIDEN_KNL-NEXT: je .LBB1_4
+; WIDEN_KNL-NEXT: .LBB1_3: # %cond.store1
+; WIDEN_KNL-NEXT: vpextrq $1, %xmm1, %rax
+; WIDEN_KNL-NEXT: vmovhps %xmm0, (%rax)
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
;
@@ -90,20 +164,49 @@
; WIDEN_SKX-LABEL: test_gather_v2i32_data:
; WIDEN_SKX: # %bb.0:
; WIDEN_SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; WIDEN_SKX-NEXT: vpmovq2m %xmm1, %k1
-; WIDEN_SKX-NEXT: vpgatherqd (,%xmm0), %xmm2 {%k1}
+; WIDEN_SKX-NEXT: vpmovq2m %xmm1, %k0
+; WIDEN_SKX-NEXT: kmovw %k0, %eax
+; WIDEN_SKX-NEXT: testb $1, %al
+; WIDEN_SKX-NEXT: jne .LBB2_1
+; WIDEN_SKX-NEXT: # %bb.2: # %else
+; WIDEN_SKX-NEXT: testb $2, %al
+; WIDEN_SKX-NEXT: jne .LBB2_3
+; WIDEN_SKX-NEXT: .LBB2_4: # %else2
+; WIDEN_SKX-NEXT: vmovdqa %xmm2, %xmm0
+; WIDEN_SKX-NEXT: retq
+; WIDEN_SKX-NEXT: .LBB2_1: # %cond.load
+; WIDEN_SKX-NEXT: vmovq %xmm0, %rcx
+; WIDEN_SKX-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
+; WIDEN_SKX-NEXT: testb $2, %al
+; WIDEN_SKX-NEXT: je .LBB2_4
+; WIDEN_SKX-NEXT: .LBB2_3: # %cond.load1
+; WIDEN_SKX-NEXT: vpextrq $1, %xmm0, %rax
+; WIDEN_SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_SKX-NEXT: vmovdqa %xmm2, %xmm0
; WIDEN_SKX-NEXT: retq
;
; WIDEN_KNL-LABEL: test_gather_v2i32_data:
; WIDEN_KNL: # %bb.0:
-; WIDEN_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
-; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; WIDEN_KNL-NEXT: vpsllq $63, %xmm1, %xmm1
; WIDEN_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; WIDEN_KNL-NEXT: kshiftlw $14, %k0, %k0
-; WIDEN_KNL-NEXT: kshiftrw $14, %k0, %k1
-; WIDEN_KNL-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
+; WIDEN_KNL-NEXT: kmovw %k0, %eax
+; WIDEN_KNL-NEXT: testb $1, %al
+; WIDEN_KNL-NEXT: jne .LBB2_1
+; WIDEN_KNL-NEXT: # %bb.2: # %else
+; WIDEN_KNL-NEXT: testb $2, %al
+; WIDEN_KNL-NEXT: jne .LBB2_3
+; WIDEN_KNL-NEXT: .LBB2_4: # %else2
+; WIDEN_KNL-NEXT: vmovdqa %xmm2, %xmm0
+; WIDEN_KNL-NEXT: vzeroupper
+; WIDEN_KNL-NEXT: retq
+; WIDEN_KNL-NEXT: .LBB2_1: # %cond.load
+; WIDEN_KNL-NEXT: vmovq %xmm0, %rcx
+; WIDEN_KNL-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
+; WIDEN_KNL-NEXT: testb $2, %al
+; WIDEN_KNL-NEXT: je .LBB2_4
+; WIDEN_KNL-NEXT: .LBB2_3: # %cond.load1
+; WIDEN_KNL-NEXT: vpextrq $1, %xmm0, %rax
+; WIDEN_KNL-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_KNL-NEXT: vmovdqa %xmm2, %xmm0
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
@@ -123,19 +226,46 @@
; WIDEN_SKX-LABEL: test_scatter_v2i32_data:
; WIDEN_SKX: # %bb.0:
; WIDEN_SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; WIDEN_SKX-NEXT: vpmovq2m %xmm2, %k1
-; WIDEN_SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
+; WIDEN_SKX-NEXT: vpmovq2m %xmm2, %k0
+; WIDEN_SKX-NEXT: kmovw %k0, %eax
+; WIDEN_SKX-NEXT: testb $1, %al
+; WIDEN_SKX-NEXT: jne .LBB3_1
+; WIDEN_SKX-NEXT: # %bb.2: # %else
+; WIDEN_SKX-NEXT: testb $2, %al
+; WIDEN_SKX-NEXT: jne .LBB3_3
+; WIDEN_SKX-NEXT: .LBB3_4: # %else2
+; WIDEN_SKX-NEXT: retq
+; WIDEN_SKX-NEXT: .LBB3_1: # %cond.store
+; WIDEN_SKX-NEXT: vmovq %xmm1, %rcx
+; WIDEN_SKX-NEXT: vmovss %xmm0, (%rcx)
+; WIDEN_SKX-NEXT: testb $2, %al
+; WIDEN_SKX-NEXT: je .LBB3_4
+; WIDEN_SKX-NEXT: .LBB3_3: # %cond.store1
+; WIDEN_SKX-NEXT: vpextrq $1, %xmm1, %rax
+; WIDEN_SKX-NEXT: vextractps $1, %xmm0, (%rax)
; WIDEN_SKX-NEXT: retq
;
; WIDEN_KNL-LABEL: test_scatter_v2i32_data:
; WIDEN_KNL: # %bb.0:
-; WIDEN_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; WIDEN_KNL-NEXT: vpsllq $63, %xmm2, %xmm2
; WIDEN_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0
-; WIDEN_KNL-NEXT: kshiftlw $14, %k0, %k0
-; WIDEN_KNL-NEXT: kshiftrw $14, %k0, %k1
-; WIDEN_KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; WIDEN_KNL-NEXT: kmovw %k0, %eax
+; WIDEN_KNL-NEXT: testb $1, %al
+; WIDEN_KNL-NEXT: jne .LBB3_1
+; WIDEN_KNL-NEXT: # %bb.2: # %else
+; WIDEN_KNL-NEXT: testb $2, %al
+; WIDEN_KNL-NEXT: jne .LBB3_3
+; WIDEN_KNL-NEXT: .LBB3_4: # %else2
+; WIDEN_KNL-NEXT: vzeroupper
+; WIDEN_KNL-NEXT: retq
+; WIDEN_KNL-NEXT: .LBB3_1: # %cond.store
+; WIDEN_KNL-NEXT: vmovq %xmm1, %rcx
+; WIDEN_KNL-NEXT: vmovss %xmm0, (%rcx)
+; WIDEN_KNL-NEXT: testb $2, %al
+; WIDEN_KNL-NEXT: je .LBB3_4
+; WIDEN_KNL-NEXT: .LBB3_3: # %cond.store1
+; WIDEN_KNL-NEXT: vpextrq $1, %xmm1, %rax
+; WIDEN_KNL-NEXT: vextractps $1, %xmm0, (%rax)
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
;
@@ -167,20 +297,58 @@
; WIDEN_SKX-LABEL: test_gather_v2i32_data_index:
; WIDEN_SKX: # %bb.0:
; WIDEN_SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; WIDEN_SKX-NEXT: vpmovq2m %xmm1, %k1
-; WIDEN_SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1}
+; WIDEN_SKX-NEXT: vpmovq2m %xmm1, %k0
+; WIDEN_SKX-NEXT: vpbroadcastq %rdi, %xmm1
+; WIDEN_SKX-NEXT: vpmovsxdq %xmm0, %xmm0
+; WIDEN_SKX-NEXT: vpsllq $2, %xmm0, %xmm0
+; WIDEN_SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; WIDEN_SKX-NEXT: kmovw %k0, %eax
+; WIDEN_SKX-NEXT: testb $1, %al
+; WIDEN_SKX-NEXT: jne .LBB4_1
+; WIDEN_SKX-NEXT: # %bb.2: # %else
+; WIDEN_SKX-NEXT: testb $2, %al
+; WIDEN_SKX-NEXT: jne .LBB4_3
+; WIDEN_SKX-NEXT: .LBB4_4: # %else2
+; WIDEN_SKX-NEXT: vmovdqa %xmm2, %xmm0
+; WIDEN_SKX-NEXT: retq
+; WIDEN_SKX-NEXT: .LBB4_1: # %cond.load
+; WIDEN_SKX-NEXT: vmovq %xmm0, %rcx
+; WIDEN_SKX-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
+; WIDEN_SKX-NEXT: testb $2, %al
+; WIDEN_SKX-NEXT: je .LBB4_4
+; WIDEN_SKX-NEXT: .LBB4_3: # %cond.load1
+; WIDEN_SKX-NEXT: vpextrq $1, %xmm0, %rax
+; WIDEN_SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_SKX-NEXT: vmovdqa %xmm2, %xmm0
; WIDEN_SKX-NEXT: retq
;
; WIDEN_KNL-LABEL: test_gather_v2i32_data_index:
; WIDEN_KNL: # %bb.0:
-; WIDEN_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; WIDEN_KNL-NEXT: vpsllq $63, %xmm1, %xmm1
; WIDEN_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; WIDEN_KNL-NEXT: kshiftlw $14, %k0, %k0
-; WIDEN_KNL-NEXT: kshiftrw $14, %k0, %k1
-; WIDEN_KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
+; WIDEN_KNL-NEXT: vpmovsxdq %xmm0, %xmm0
+; WIDEN_KNL-NEXT: vpsllq $2, %xmm0, %xmm0
+; WIDEN_KNL-NEXT: vmovq %rdi, %xmm1
+; WIDEN_KNL-NEXT: vpbroadcastq %xmm1, %xmm1
+; WIDEN_KNL-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; WIDEN_KNL-NEXT: kmovw %k0, %eax
+; WIDEN_KNL-NEXT: testb $1, %al
+; WIDEN_KNL-NEXT: jne .LBB4_1
+; WIDEN_KNL-NEXT: # %bb.2: # %else
+; WIDEN_KNL-NEXT: testb $2, %al
+; WIDEN_KNL-NEXT: jne .LBB4_3
+; WIDEN_KNL-NEXT: .LBB4_4: # %else2
+; WIDEN_KNL-NEXT: vmovdqa %xmm2, %xmm0
+; WIDEN_KNL-NEXT: vzeroupper
+; WIDEN_KNL-NEXT: retq
+; WIDEN_KNL-NEXT: .LBB4_1: # %cond.load
+; WIDEN_KNL-NEXT: vmovq %xmm0, %rcx
+; WIDEN_KNL-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
+; WIDEN_KNL-NEXT: testb $2, %al
+; WIDEN_KNL-NEXT: je .LBB4_4
+; WIDEN_KNL-NEXT: .LBB4_3: # %cond.load1
+; WIDEN_KNL-NEXT: vpextrq $1, %xmm0, %rax
+; WIDEN_KNL-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_KNL-NEXT: vmovdqa %xmm2, %xmm0
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
@@ -201,19 +369,55 @@
; WIDEN_SKX-LABEL: test_scatter_v2i32_data_index:
; WIDEN_SKX: # %bb.0:
; WIDEN_SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; WIDEN_SKX-NEXT: vpmovq2m %xmm2, %k1
-; WIDEN_SKX-NEXT: vpscatterdd %xmm0, (%rdi,%xmm1,4) {%k1}
+; WIDEN_SKX-NEXT: vpmovq2m %xmm2, %k0
+; WIDEN_SKX-NEXT: vpbroadcastq %rdi, %xmm2
+; WIDEN_SKX-NEXT: vpmovsxdq %xmm1, %xmm1
+; WIDEN_SKX-NEXT: vpsllq $2, %xmm1, %xmm1
+; WIDEN_SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; WIDEN_SKX-NEXT: kmovw %k0, %eax
+; WIDEN_SKX-NEXT: testb $1, %al
+; WIDEN_SKX-NEXT: jne .LBB5_1
+; WIDEN_SKX-NEXT: # %bb.2: # %else
+; WIDEN_SKX-NEXT: testb $2, %al
+; WIDEN_SKX-NEXT: jne .LBB5_3
+; WIDEN_SKX-NEXT: .LBB5_4: # %else2
+; WIDEN_SKX-NEXT: retq
+; WIDEN_SKX-NEXT: .LBB5_1: # %cond.store
+; WIDEN_SKX-NEXT: vmovq %xmm1, %rcx
+; WIDEN_SKX-NEXT: vmovss %xmm0, (%rcx)
+; WIDEN_SKX-NEXT: testb $2, %al
+; WIDEN_SKX-NEXT: je .LBB5_4
+; WIDEN_SKX-NEXT: .LBB5_3: # %cond.store1
+; WIDEN_SKX-NEXT: vpextrq $1, %xmm1, %rax
+; WIDEN_SKX-NEXT: vextractps $1, %xmm0, (%rax)
; WIDEN_SKX-NEXT: retq
;
; WIDEN_KNL-LABEL: test_scatter_v2i32_data_index:
; WIDEN_KNL: # %bb.0:
-; WIDEN_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; WIDEN_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; WIDEN_KNL-NEXT: vpsllq $63, %xmm2, %xmm2
; WIDEN_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0
-; WIDEN_KNL-NEXT: kshiftlw $14, %k0, %k0
-; WIDEN_KNL-NEXT: kshiftrw $14, %k0, %k1
-; WIDEN_KNL-NEXT: vpscatterdd %zmm0, (%rdi,%zmm1,4) {%k1}
+; WIDEN_KNL-NEXT: vpmovsxdq %xmm1, %xmm1
+; WIDEN_KNL-NEXT: vpsllq $2, %xmm1, %xmm1
+; WIDEN_KNL-NEXT: vmovq %rdi, %xmm2
+; WIDEN_KNL-NEXT: vpbroadcastq %xmm2, %xmm2
+; WIDEN_KNL-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; WIDEN_KNL-NEXT: kmovw %k0, %eax
+; WIDEN_KNL-NEXT: testb $1, %al
+; WIDEN_KNL-NEXT: jne .LBB5_1
+; WIDEN_KNL-NEXT: # %bb.2: # %else
+; WIDEN_KNL-NEXT: testb $2, %al
+; WIDEN_KNL-NEXT: jne .LBB5_3
+; WIDEN_KNL-NEXT: .LBB5_4: # %else2
+; WIDEN_KNL-NEXT: vzeroupper
+; WIDEN_KNL-NEXT: retq
+; WIDEN_KNL-NEXT: .LBB5_1: # %cond.store
+; WIDEN_KNL-NEXT: vmovq %xmm1, %rcx
+; WIDEN_KNL-NEXT: vmovss %xmm0, (%rcx)
+; WIDEN_KNL-NEXT: testb $2, %al
+; WIDEN_KNL-NEXT: je .LBB5_4
+; WIDEN_KNL-NEXT: .LBB5_3: # %cond.store1
+; WIDEN_KNL-NEXT: vpextrq $1, %xmm1, %rax
+; WIDEN_KNL-NEXT: vextractps $1, %xmm0, (%rax)
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -1993,144 +1993,34 @@
; FVW2-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]]
; FVW2-NEXT: [[TMP12:%.*]] = shl i64 [[N_VEC]], 4
; FVW2-NEXT: [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]]
-; FVW2-NEXT: [[TMP13:%.*]] = add nsw i64 [[N_VEC]], -4
-; FVW2-NEXT: [[TMP14:%.*]] = lshr exact i64 [[TMP13]], 2
-; FVW2-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[TMP14]], 1
-; FVW2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP15]], 3
-; FVW2-NEXT: [[TMP16:%.*]] = icmp ult i64 [[TMP13]], 12
-; FVW2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
-; FVW2: vector.ph.new:
-; FVW2-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP15]], 9223372036854775804
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
-; FVW2-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH_NEW]] ], [ [[PTR_IND_3:%.*]], [[VECTOR_BODY]] ]
-; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
-; FVW2-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FVW2-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]]
-; FVW2-NEXT: [[TMP17:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP20]], align 4, !alias.scope !7
-; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP19]], i64 2
-; FVW2-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD16:%.*]] = load <2 x float>, <2 x float>* [[TMP22]], align 4, !alias.scope !7
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD]], <2 x float*> [[TMP17]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16]], <2 x float*> [[TMP18]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[TMP23:%.*]] = bitcast float* [[NEXT_GEP]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x float>, <2 x float>* [[TMP23]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[NEXT_GEP]], i64 2
-; FVW2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x float>, <2 x float>* [[TMP25]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP17]], i64 1
-; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP18]], i64 1
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17]], <2 x float*> [[TMP26]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18]], <2 x float*> [[TMP27]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 4
-; FVW2-NEXT: [[PTR_IND:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 64
-; FVW2-NEXT: [[NEXT_GEP_1:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT]]
-; FVW2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[PTR_IND]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT: [[TMP29:%.*]] = getelementptr float, float* [[PTR_IND]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_1]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x float>, <2 x float>* [[TMP31]], align 4, !alias.scope !7
-; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 2
-; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD16_1:%.*]] = load <2 x float>, <2 x float>* [[TMP33]], align 4, !alias.scope !7
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_1]], <2 x float*> [[TMP28]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_1]], <2 x float*> [[TMP29]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[TMP34:%.*]] = bitcast float* [[NEXT_GEP_1]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD17_1:%.*]] = load <2 x float>, <2 x float>* [[TMP34]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP35:%.*]] = getelementptr float, float* [[NEXT_GEP_1]], i64 2
-; FVW2-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP35]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD18_1:%.*]] = load <2 x float>, <2 x float>* [[TMP36]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP28]], i64 1
-; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP29]], i64 1
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_1]], <2 x float*> [[TMP37]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_1]], <2 x float*> [[TMP38]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 8
-; FVW2-NEXT: [[PTR_IND_1:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 128
-; FVW2-NEXT: [[NEXT_GEP_2:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_1]]
-; FVW2-NEXT: [[TMP39:%.*]] = getelementptr float, float* [[PTR_IND_1]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[PTR_IND_1]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_2]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP42:%.*]] = bitcast float* [[TMP41]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x float>, <2 x float>* [[TMP42]], align 4, !alias.scope !7
-; FVW2-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP41]], i64 2
-; FVW2-NEXT: [[TMP44:%.*]] = bitcast float* [[TMP43]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD16_2:%.*]] = load <2 x float>, <2 x float>* [[TMP44]], align 4, !alias.scope !7
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_2]], <2 x float*> [[TMP39]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_2]], <2 x float*> [[TMP40]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[TMP45:%.*]] = bitcast float* [[NEXT_GEP_2]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD17_2:%.*]] = load <2 x float>, <2 x float>* [[TMP45]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[NEXT_GEP_2]], i64 2
-; FVW2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD18_2:%.*]] = load <2 x float>, <2 x float>* [[TMP47]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP39]], i64 1
-; FVW2-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP40]], i64 1
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_2]], <2 x float*> [[TMP48]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_2]], <2 x float*> [[TMP49]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 12
-; FVW2-NEXT: [[PTR_IND_2:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 192
-; FVW2-NEXT: [[NEXT_GEP_3:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_2]]
-; FVW2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[PTR_IND_2]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT: [[TMP51:%.*]] = getelementptr float, float* [[PTR_IND_2]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_3]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x float>, <2 x float>* [[TMP53]], align 4, !alias.scope !7
-; FVW2-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP52]], i64 2
-; FVW2-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD16_3:%.*]] = load <2 x float>, <2 x float>* [[TMP55]], align 4, !alias.scope !7
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_3]], <2 x float*> [[TMP50]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_3]], <2 x float*> [[TMP51]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[TMP56:%.*]] = bitcast float* [[NEXT_GEP_3]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD17_3:%.*]] = load <2 x float>, <2 x float>* [[TMP56]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP57:%.*]] = getelementptr float, float* [[NEXT_GEP_3]], i64 2
-; FVW2-NEXT: [[TMP58:%.*]] = bitcast float* [[TMP57]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD18_3:%.*]] = load <2 x float>, <2 x float>* [[TMP58]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP50]], i64 1
-; FVW2-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP51]], i64 1
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_3]], <2 x float*> [[TMP59]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_3]], <2 x float*> [[TMP60]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[INDEX_NEXT_3]] = add nuw i64 [[INDEX]], 16
-; FVW2-NEXT: [[PTR_IND_3]] = getelementptr float, float* [[POINTER_PHI]], i64 256
-; FVW2-NEXT: [[NITER_NSUB_3]] = add i64 [[NITER]], -4
-; FVW2-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NSUB_3]], 0
-; FVW2-NEXT: br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; FVW2: middle.block.unr-lcssa:
-; FVW2-NEXT: [[POINTER_PHI_UNR:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND_3]], [[VECTOR_BODY]] ]
-; FVW2-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ]
-; FVW2-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
-; FVW2-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]]
-; FVW2: vector.body.epil:
-; FVW2-NEXT: [[POINTER_PHI_EPIL:%.*]] = phi float* [ [[PTR_IND_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[POINTER_PHI_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
-; FVW2-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
-; FVW2-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[XTRAITER]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
-; FVW2-NEXT: [[NEXT_GEP_EPIL:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_EPIL]]
-; FVW2-NEXT: [[TMP61:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT: [[TMP62:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_EPIL]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP64]], align 4, !alias.scope !7
-; FVW2-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP63]], i64 2
-; FVW2-NEXT: [[TMP66:%.*]] = bitcast float* [[TMP65]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD16_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP66]], align 4, !alias.scope !7
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_EPIL]], <2 x float*> [[TMP61]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_EPIL]], <2 x float*> [[TMP62]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[TMP67:%.*]] = bitcast float* [[NEXT_GEP_EPIL]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD17_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP67]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP68:%.*]] = getelementptr float, float* [[NEXT_GEP_EPIL]], i64 2
-; FVW2-NEXT: [[TMP69:%.*]] = bitcast float* [[TMP68]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD18_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP69]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP61]], i64 1
-; FVW2-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP62]], i64 1
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_EPIL]], <2 x float*> [[TMP70]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_EPIL]], <2 x float*> [[TMP71]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[INDEX_NEXT_EPIL]] = add nuw i64 [[INDEX_EPIL]], 4
-; FVW2-NEXT: [[PTR_IND_EPIL]] = getelementptr float, float* [[POINTER_PHI_EPIL]], i64 64
-; FVW2-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1
-; FVW2-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0
-; FVW2-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP16:![0-9]+]]
+; FVW2-NEXT: [[TMP13:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> <i64 0, i64 16>
+; FVW2-NEXT: [[TMP14:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> <i64 32, i64 48>
+; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
+; FVW2-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <2 x float>*
+; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP16]], align 4, !alias.scope !7
+; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 2
+; FVW2-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <2 x float>*
+; FVW2-NEXT: [[WIDE_LOAD16:%.*]] = load <2 x float>, <2 x float>* [[TMP18]], align 4, !alias.scope !7
+; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD]], <2 x float*> [[TMP13]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
+; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16]], <2 x float*> [[TMP14]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
+; FVW2-NEXT: [[TMP19:%.*]] = bitcast float* [[NEXT_GEP]] to <2 x float>*
+; FVW2-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x float>, <2 x float>* [[TMP19]], align 4, !alias.scope !14
+; FVW2-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[NEXT_GEP]], i64 2
+; FVW2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <2 x float>*
+; FVW2-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x float>, <2 x float>* [[TMP21]], align 4, !alias.scope !14
+; FVW2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP13]], i64 1
+; FVW2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP14]], i64 1
+; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17]], <2 x float*> [[TMP22]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
+; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18]], <2 x float*> [[TMP23]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
+; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; FVW2-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; FVW2-NEXT: [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i64 64
+; FVW2-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; FVW2: middle.block:
; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]]
@@ -2142,15 +2032,15 @@
; FVW2-NEXT: [[PTR_ADDR_012:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[PTR_ADDR_012_PH]], [[FOR_BODY_PREHEADER]] ]
; FVW2-NEXT: [[DEST_ADDR_011:%.*]] = phi float* [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ], [ [[DEST_ADDR_011_PH]], [[FOR_BODY_PREHEADER]] ]
; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP72:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; FVW2-NEXT: store float [[TMP72]], float* [[DEST_ADDR_011]], align 4
-; FVW2-NEXT: [[TMP73:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
+; FVW2-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; FVW2-NEXT: store float [[TMP25]], float* [[DEST_ADDR_011]], align 4
+; FVW2-NEXT: [[TMP26:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1
-; FVW2-NEXT: store float [[TMP73]], float* [[ARRAYIDX5]], align 4
+; FVW2-NEXT: store float [[TMP26]], float* [[ARRAYIDX5]], align 4
; FVW2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1
; FVW2-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16
; FVW2-NEXT: [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]]
-; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -229,33 +229,35 @@
define void @lookahead_external_uses(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2) {
; CHECK-LABEL: @lookahead_external_uses(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
-; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double*> poison, double* [[A]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[A]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> <i64 0, i64 2>
+; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B2]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP7]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8
; CHECK-NEXT: ret void
;
@@ -319,36 +321,38 @@
; CHECK-LABEL: @lookahead_limit_users_budget(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
+; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double*> poison, double* [[B:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[B]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> <i64 0, i64 2>
+; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B1]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP11]]
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
-; CHECK-NEXT: store double [[TMP14]], double* [[EXT1:%.*]], align 8
-; CHECK-NEXT: store double [[TMP14]], double* [[EXT2:%.*]], align 8
-; CHECK-NEXT: store double [[TMP14]], double* [[EXT3:%.*]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8
+; CHECK-NEXT: store double [[TMP12]], double* [[EXT2:%.*]], align 8
+; CHECK-NEXT: store double [[TMP12]], double* [[EXT3:%.*]], align 8
; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8
; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
@@ -58,22 +58,22 @@
define void @delete_pointer_bound(float* %a, float* %b, i1 %c) #0 {
; CHECK-LABEL: @delete_pointer_bound(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float*> poison, float* [[B:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float*> [[TMP0]], float* [[B]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, <2 x float*> [[TMP1]], <2 x i64> <i64 10, i64 14>
+; CHECK-NEXT: [[B_10:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 10
+; CHECK-NEXT: [[B_14:%.*]] = getelementptr inbounds float, float* [[B]], i64 14
; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
; CHECK: else:
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[SHUFFLE]], <4 x float> poison, <8 x i32>
-; CHECK-NEXT: [[I71:%.*]] = shufflevector <8 x float> undef, <8 x float> [[TMP4]], <8 x i32>
-; CHECK-NEXT: call void @use(<8 x float> [[I71]])
+; CHECK-NEXT: [[L0:%.*]] = load float, float* [[B_10]], align 4
+; CHECK-NEXT: [[L1:%.*]] = load float, float* [[B_14]], align 4
+; CHECK-NEXT: [[I2:%.*]] = insertelement <8 x float> undef, float [[L0]], i32 2
+; CHECK-NEXT: [[I3:%.*]] = insertelement <8 x float> [[I2]], float [[L0]], i32 3
+; CHECK-NEXT: [[I4:%.*]] = insertelement <8 x float> [[I3]], float [[L1]], i32 4
+; CHECK-NEXT: [[I7:%.*]] = insertelement <8 x float> [[I4]], float [[L1]], i32 7
+; CHECK-NEXT: call void @use(<8 x float> [[I7]])
; CHECK-NEXT: ret void
; CHECK: then:
; CHECK-NEXT: [[A_8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 8
; CHECK-NEXT: store float 0.000000e+00, float* [[A_8]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float*> [[TMP2]], i32 1
-; CHECK-NEXT: [[L6:%.*]] = load float, float* [[TMP5]], align 4
+; CHECK-NEXT: [[L6:%.*]] = load float, float* [[B_14]], align 4
; CHECK-NEXT: [[A_5:%.*]] = getelementptr inbounds float, float* [[A]], i64 5
; CHECK-NEXT: store float [[L6]], float* [[A_5]], align 4
; CHECK-NEXT: [[A_6:%.*]] = getelementptr inbounds float, float* [[A]], i64 6
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -24,8 +24,11 @@
; SSE-NEXT: ret void
;
; AVX-LABEL: @foo(
-; AVX-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> <i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2)>, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
-; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
+; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8
+; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i32 0
+; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i32 1
+; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; AVX-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16
; AVX-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -2,8 +2,8 @@
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL
define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
; CHECK-LABEL: @gather_load(
@@ -69,44 +69,66 @@
; AVX-LABEL: @gather_load_2(
; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
-; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
-; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
-; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
+; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_2(
-; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
-; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX2-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
+; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: ret void
;
-; AVX512-LABEL: @gather_load_2(
-; AVX512-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
-; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX512-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: ret void
+; AVX512F-LABEL: @gather_load_2(
+; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
+; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: ret void
+;
+; AVX512VL-LABEL: @gather_load_2(
+; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
+; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
+; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512VL-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: ret void
;
%3 = getelementptr inbounds i32, i32* %1, i64 1
%4 = load i32, i32* %3, align 4, !tbaa !2
@@ -175,102 +197,123 @@
;
; AVX-LABEL: @gather_load_3(
; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; AVX-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4
-; AVX-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2
+; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3
+; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4
+; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5
+; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6
+; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7
+; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_3(
; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX2-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
-; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
-; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2
-; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX2-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3
-; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX2-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4
-; AVX2-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1
+; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2
+; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3
+; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4
+; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5
+; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6
+; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7
+; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
+; AVX2-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: ret void
;
-; AVX512-LABEL: @gather_load_3(
-; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
-; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
-; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
-; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2
-; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
-; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3
-; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
-; AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4
-; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: ret void
+; AVX512F-LABEL: @gather_load_3(
+; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX512F-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 1
+; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i32 2
+; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
+; AVX512F-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
+; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX512F-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX512F-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i32 0
+; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i32 1
+; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i32 2
+; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i32 3
+; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
+; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: ret void
+;
+; AVX512VL-LABEL: @gather_load_3(
+; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
+; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; AVX512VL-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; AVX512VL-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX512VL-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2
+; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
+; AVX512VL-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512VL-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3
+; AVX512VL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; AVX512VL-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512VL-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4
+; AVX512VL-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: ret void
;
%3 = load i32, i32* %1, align 4, !tbaa !2
%4 = add i32 %3, 1
@@ -356,19 +399,12 @@
; SSE-NEXT: ret void
;
; AVX-LABEL: @gather_load_4(
-; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
-; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
-; AVX-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
@@ -378,81 +414,109 @@
; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
-; AVX-NEXT: [[T8:%.*]] = add i32 [[T7]], 2
-; AVX-NEXT: [[T12:%.*]] = add i32 [[T11]], 3
-; AVX-NEXT: [[T16:%.*]] = add i32 [[T15]], 4
-; AVX-NEXT: [[T20:%.*]] = add i32 [[T19]], 1
-; AVX-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
-; AVX-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
-; AVX-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
-; AVX-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1
+; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2
+; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3
+; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4
+; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5
+; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6
+; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7
+; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_4(
-; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
-; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
-; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
+; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
+; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
+; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
+; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
-; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
-; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX2-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
-; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
-; AVX2-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
-; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
-; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
-; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
-; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1
+; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2
+; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3
+; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4
+; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5
+; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6
+; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7
+; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
+; AVX2-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: ret void
;
-; AVX512-LABEL: @gather_load_4(
-; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* 
[[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: ret void +; AVX512F-LABEL: @gather_load_4( +; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 +; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 +; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i32 0 +; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i32 1 +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i32 2 +; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x 
i32> [[TMP3]], i32 [[T15]], i32 3
+; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i32 0
+; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i32 1
+; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i32 2
+; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i32 3
+; AVX512F-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
+; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>*
+; AVX512F-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: ret void
+;
+; AVX512VL-LABEL: @gather_load_4(
+; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
+; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0
+; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
+; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
+; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
+; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
+; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
+; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
+; AVX512VL-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
+; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
+; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
+; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
+; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
+; AVX512VL-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: ret void
;
%t5 = getelementptr inbounds i32, i32* %t0, i64 1
%t6 = getelementptr inbounds i32, i32* %t1, i64 11
@@ -502,99 +566,213 @@
define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) {
; SSE-LABEL: @gather_load_div(
-; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
-; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
-; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
-; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
-; SSE-NEXT: [[TMP7:%.*]] = 
insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2 -; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3 -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer -; SSE-NEXT: [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]] -; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 -; SSE-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]] -; SSE-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; SSE-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; SSE-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i32 2 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i32 3 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i32 1 +; SSE-NEXT: 
[[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP13]], i32 2 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i32 3 +; SSE-NEXT: [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]] +; SSE-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; SSE-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; SSE-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; SSE-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP37]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP43]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i32 0 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP34]], i32 1 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP38]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP42]], i32 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i32 0 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP36]], i32 1 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP40]], i32 2 +; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i32 3 +; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]] +; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP27]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> -; AVX-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> -; AVX-NEXT: [[TMP8:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> -; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 -; AVX-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> [[TMP9]], float* [[TMP3]], i32 1 -; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x 
float*> [[TMP6]], <2 x float*> poison, <8 x i32> -; AVX-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> [[TMP11]], <8 x i32> -; AVX-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> -; AVX-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> -; AVX-NEXT: [[TMP15:%.*]] = shufflevector <2 x float*> [[TMP8]], <2 x float*> poison, <8 x i32> -; AVX-NEXT: [[TMP16:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> [[TMP15]], <8 x i32> -; AVX-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX-NEXT: [[TMP19:%.*]] = getelementptr float, <8 x float*> [[TMP18]], <8 x i64> -; AVX-NEXT: [[TMP20:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP19]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP21:%.*]] = fdiv <8 x float> [[TMP17]], [[TMP20]] -; AVX-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX-NEXT: store <8 x float> [[TMP21]], <8 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: 
[[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i32 1 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i32 2 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i32 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i32 4 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i32 5 +; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i32 6 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i32 7 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i32 0 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i32 1 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i32 2 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i32 3 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i32 4 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i32 5 +; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i32 6 +; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i32 7 +; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] +; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 -; AVX2-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> -; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0 -; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> -; AVX2-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1 -; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> -; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> -; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> -; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> -; AVX2-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX2-NEXT: [[TMP18:%.*]] 
= getelementptr float, <8 x float*> [[TMP17]], <8 x i64> -; AVX2-NEXT: [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]] -; AVX2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX2-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i32 1 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i32 2 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i32 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i32 4 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x 
float> [[TMP38]], float [[TMP23]], i32 5 +; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i32 6 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i32 7 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i32 0 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i32 1 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i32 2 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i32 3 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i32 4 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i32 5 +; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i32 6 +; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i32 7 +; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] +; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX2-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @gather_load_div( -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 -; AVX512-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> -; AVX512-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> -; AVX512-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1 -; AVX512-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> -; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> -; AVX512-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> -; AVX512-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> -; AVX512-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> -; AVX512-NEXT: [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]] -; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX512-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: ret void +; AVX512F-LABEL: @gather_load_div( +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 +; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 +; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> 
[[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0 +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1 +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> +; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> +; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> +; AVX512F-NEXT: [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]] +; AVX512F-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512F-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load_div( +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 +; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1 +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> +; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> +; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> +; AVX512VL-NEXT: 
[[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]] +; AVX512VL-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512VL-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %3 = load float, float* %1, align 4, !tbaa !2 %4 = getelementptr inbounds float, float* %1, i64 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -2,8 +2,8 @@ ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512 -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) { ; CHECK-LABEL: @gather_load( @@ -69,44 +69,66 @@ ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 ; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 ; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 -; AVX-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, 
i32* [[TMP1]], i64 5
+; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
+; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_2(
-; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
-; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX2-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
+; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
; AVX2-NEXT: ret void
;
-; AVX512-LABEL: @gather_load_2(
-; AVX512-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
-; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
-; AVX512-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
-; AVX512-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
-; AVX512-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX512-NEXT: ret void
+; AVX512F-LABEL: @gather_load_2(
+; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
+; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
+; AVX512F-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
+; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
+; AVX512F-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
+; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512F-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: ret void
+;
+; AVX512VL-LABEL: @gather_load_2(
+; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i32 0
+; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
+; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512VL-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; AVX512VL-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: ret void
;
%3 = getelementptr inbounds i32, i32* %1, i64 1
%4 = load i32, i32* %3, align 4, !tbaa !2
@@ -175,102 +197,123 @@
;
; AVX-LABEL: @gather_load_3(
; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
-; AVX-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
+; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
-; AVX-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
+; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
-; AVX-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
+; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* 
[[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 ; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; AVX-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; AVX-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6 +; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7 +; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], +; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( ; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX2-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), 
!tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX2-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX2-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX2-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6 +; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7 +; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], +; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @gather_load_3( -; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] 
= getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: ret void +; AVX512F-LABEL: @gather_load_3( +; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX512F-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX512F-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 1 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP7]], i32 2 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; AVX512F-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], +; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX512F-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX512F-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: 
[[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512F-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512F-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i32 0
+; AVX512F-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP20]], i32 1
+; AVX512F-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP22]], i32 2
+; AVX512F-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i32 3
+; AVX512F-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], <i32 1, i32 2, i32 3, i32 4>
+; AVX512F-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
+; AVX512F-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT: ret void
+;
+; AVX512VL-LABEL: @gather_load_3(
+; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
+; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i32 0
+; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
+; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
+; AVX512VL-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
+; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
+; AVX512VL-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; AVX512VL-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX512VL-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2
+; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
+; AVX512VL-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX512VL-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3
+; AVX512VL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
+; AVX512VL-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX512VL-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4
+; AVX512VL-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT: ret void
;
%3 = load i32, i32* %1, align 4, !tbaa !2
%4 = add i32 %3, 1
@@ -356,19 +399,12 @@
; SSE-NEXT: ret void
;
; AVX-LABEL: @gather_load_4(
-; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
-; AVX-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
-; AVX-NEXT: 
[[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 ; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] @@ -378,81 +414,109 @@ ; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; AVX-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; AVX-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 -; AVX-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 -; AVX-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4 +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7 +; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX2-NEXT: [[T14:%.*]] 
= getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX2-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0 +; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1 +; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2 +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3 +; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4 +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7 +; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], +; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX2-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; -; AVX512-LABEL: @gather_load_4( -; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = 
getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 -; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] -; AVX512-NEXT: ret void +; AVX512F-LABEL: @gather_load_4( +; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 +; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 +; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i32 0 +; AVX512F-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i32 1 +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i32 2 +; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i32 3 +; AVX512F-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], +; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i32 0 +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T23]], i32 1 +; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T27]], i32 2 +; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T31]], i32 3 +; AVX512F-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], +; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP11]], align 4, !tbaa 
[[TBAA0]] +; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* +; AVX512F-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load_4( +; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i32 0 +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> +; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 +; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 +; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 +; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 +; AVX512VL-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], +; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 +; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 +; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 +; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* +; AVX512VL-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 %t6 = getelementptr inbounds i32, i32* %t1, i64 11 @@ -502,77 +566,214 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2 -; SSE-NEXT: [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3 -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer -; SSE-NEXT: [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> 
@llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]] -; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 -; SSE-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> -; SSE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> , <4 x float> undef), !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]] -; SSE-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 +; SSE-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; SSE-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; SSE-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i32 2 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i32 3 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i32 1 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP13]], i32 2 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i32 3 +; SSE-NEXT: [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]] +; SSE-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; SSE-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa 
[[TBAA0]] +; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; SSE-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; SSE-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP37]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP43]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i32 0 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP34]], i32 1 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP38]], i32 2 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP42]], i32 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i32 0 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP36]], i32 1 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP40]], i32 2 +; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i32 3 +; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]] +; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP27]] to <4 x float>* +; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> -; AVX-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> -; AVX-NEXT: [[TMP8:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> -; AVX-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 -; AVX-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> [[TMP9]], float* [[TMP3]], i32 1 -; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> -; AVX-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> [[TMP11]], <8 x i32> -; AVX-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> -; AVX-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> -; AVX-NEXT: [[TMP15:%.*]] = shufflevector <2 x float*> [[TMP8]], <2 x float*> poison, <8 x i32> -; AVX-NEXT: [[TMP16:%.*]] = shufflevector <8 x float*> [[TMP14]], <8 x float*> [[TMP15]], <8 x i32> -; AVX-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x 
float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX-NEXT: [[TMP19:%.*]] = getelementptr float, <8 x float*> [[TMP18]], <8 x i64> -; AVX-NEXT: [[TMP20:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP19]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP21:%.*]] = fdiv <8 x float> [[TMP17]], [[TMP20]] -; AVX-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX-NEXT: store <8 x float> [[TMP21]], <8 x float>* [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i32 1 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i32 2 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> 
[[TMP36]], float [[TMP15]], i32 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i32 4 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i32 5 +; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i32 6 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i32 7 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i32 0 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i32 1 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i32 2 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i32 3 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i32 4 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i32 5 +; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i32 6 +; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i32 7 +; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] +; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 -; AVX2-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> -; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0 -; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> -; AVX2-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1 -; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> -; AVX2-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> -; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> -; AVX2-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> -; AVX2-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> -; AVX2-NEXT: [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]] -; AVX2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* -; AVX2-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 +; AVX2-NEXT: 
[[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i32 1 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i32 2 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i32 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i32 4 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i32 5 +; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i32 6 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i32 7 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i32 0 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i32 1 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i32 2 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i32 3 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> 
[[TMP45]], float [[TMP21]], i32 4 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i32 5 +; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i32 6 +; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i32 7 +; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] +; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX2-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; +; AVX512F-LABEL: @gather_load_div( +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 +; AVX512F-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 +; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> +; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0 +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> +; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1 +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> +; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> +; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> +; AVX512F-NEXT: [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]] +; AVX512F-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512F-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: ret void +; +; AVX512VL-LABEL: @gather_load_div( +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 +; AVX512VL-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> +; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0 +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], 
float* [[TMP3]], i32 1 +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> +; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> +; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> +; AVX512VL-NEXT: [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]] +; AVX512VL-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* +; AVX512VL-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: ret void +; ; AVX512-LABEL: @gather_load_div( ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10 ; AVX512-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0 @@ -595,7 +796,6 @@ ; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]] ; AVX512-NEXT: ret void -; %3 = load float, float* %1, align 4, !tbaa !2 %4 = getelementptr inbounds float, float* %1, i64 4 %5 = load float, float* %4, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll @@ -5,23 +5,27 @@ ; CHECK-LABEL: @non-ordered-stores( ; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0 ; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 +; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32*> poison, i32* [[IN_ADDR]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32*> [[TMP1]], i32* [[IN_ADDR]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, <2 x i32*> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 +; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 ; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 +; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 +; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 ; CHECK-NEXT: [[LOAD_7:%.*]] = load 
i32, i32* [[GEP_5]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32*> poison, i32* [[INN_ADDR]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32*> [[TMP5]], i32* [[INN_ADDR]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP7]], i32 4, <2 x i1> , <2 x i32> undef) +; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 +; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 ; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]] ; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> [[TMP4]], [[TMP8]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_4]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_8]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]] ; CHECK-NEXT: br label [[BLOCK1:%.*]] ; CHECK: block1: ; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 5 @@ -36,8 +40,8 @@ ; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_10]], align 4 ; CHECK-NEXT: store i32 [[LOAD_9]], i32* [[GEP_9]], align 4 ; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_11]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP9]], <2 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0
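
The stores-non-ordered.ll update above shows the cost-model shift at the narrowest width: the CHECK lines now expect plain scalar loads plus insertelement instead of @llvm.masked.gather.v2i32.v2p0i32, since 2-element gathers are no longer reported as legal. A minimal sketch of that preferred shape, as standalone typed-pointer IR with hypothetical names (not taken from the test):

; Two non-consecutive i32 loads combined into a <2 x i32> value via
; insertelement; this scalarized form is what the updated cost model
; favors over a v2 masked gather on these targets.
define <2 x i32> @sketch_v2_from_strided_loads(i32* %base) {
  %p1 = getelementptr inbounds i32, i32* %base, i64 1
  %p3 = getelementptr inbounds i32, i32* %base, i64 3
  %a = load i32, i32* %p1, align 4
  %b = load i32, i32* %p3, align 4
  %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
  ret <2 x i32> %v1
}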