Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -1213,13 +1213,15 @@
   /// reduce runtime.
   virtual bool ShouldShrinkFPConstant(EVT) const { return true; }

-  // Return true if it is profitable to reduce the given load node to a smaller
-  // type.
-  //
-  // e.g. (i16 (trunc (i32 (load x))) -> i16 load x should be performed
-  virtual bool shouldReduceLoadWidth(SDNode *Load,
-                                     ISD::LoadExtType ExtTy,
+  /// Return true if it is profitable to reduce a load to a smaller type.
+  /// Example: (i16 (trunc (i32 (load x)))) -> i16 load x
+  virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
                                      EVT NewVT) const {
+    // By default, assume that it is cheaper to extract a subvector from a wide
+    // vector load rather than creating multiple narrow vector loads.
+    if (NewVT.isVector() && !Load->hasOneUse())
+      return false;
+
     return true;
   }
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16713,17 +16713,19 @@
   if (DAG.getDataLayout().isBigEndian())
     return SDValue();

-  // TODO: The one-use check is overly conservative. Check the cost of the
-  // extract instead or remove that condition entirely.
   auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
   auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
-  if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() ||
-      !ExtIdx)
+  if (!Ld || Ld->getExtensionType() || Ld->isVolatile() || !ExtIdx)
+    return SDValue();
+
+  // Allow targets to opt-out.
+  EVT VT = Extract->getValueType(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
    return SDValue();

   // The narrow load will be offset from the base address of the old load if
   // we are extracting from something besides index 0 (little-endian).
-  EVT VT = Extract->getValueType(0);
   SDLoc DL(Extract);
   SDValue BaseAddr = Ld->getOperand(1);
   unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize();
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8084,6 +8084,10 @@
 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                                   ISD::LoadExtType ExtTy,
                                                   EVT NewVT) const {
+  // TODO: This may be worth removing. Check regression tests for diffs.
+  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
+    return false;
+
   // If we're reducing the load width in order to avoid having to use an extra
   // instruction to do extension then it's probably a good idea.
   if (ExtTy != ISD::NON_EXTLOAD)
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -655,8 +655,11 @@
 }

 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
-                                                 ISD::LoadExtType,
+                                                 ISD::LoadExtType ExtTy,
                                                  EVT NewVT) const {
+  // TODO: This may be worth removing. Check regression tests for diffs.
+  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
+    return false;
   unsigned NewSize = NewVT.getStoreSizeInBits();
Index: lib/Target/Hexagon/HexagonISelLowering.cpp
===================================================================
--- lib/Target/Hexagon/HexagonISelLowering.cpp
+++ lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -3082,6 +3082,10 @@
 bool HexagonTargetLowering::shouldReduceLoadWidth(SDNode *Load,
       ISD::LoadExtType ExtTy, EVT NewVT) const {
+  // TODO: This may be worth removing. Check regression tests for diffs.
+  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
+    return false;
+
   auto *L = cast<LoadSDNode>(Load);
   std::pair<SDValue, int> BO = getBaseAndOffset(L->getBasePtr());
   // Small-data object, do not shrink.
Index: test/CodeGen/X86/2012-01-12-extract-sv.ll
===================================================================
--- test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -4,8 +4,7 @@
 define void @endless_loop() {
 ; CHECK-LABEL: endless_loop:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovaps (%eax), %ymm0
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovaps (%eax), %xmm0
 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
Index: test/CodeGen/X86/avg.ll
===================================================================
--- test/CodeGen/X86/avg.ll
+++ test/CodeGen/X86/avg.ll
@@ -100,13 +100,11 @@
 ;
 ; AVX1-LABEL: avg_v32i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
-; AVX1-NEXT: vmovdqa (%rsi), %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa (%rsi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
+; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: vmovups %ymm0, (%rax)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
@@ -268,94 +266,97 @@
 ;
 ; AVX1-LABEL: avg_v48i8:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm2
-; AVX1-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX1-NEXT: vmovdqa (%rsi), %ymm1
-; AVX1-NEXT: vmovdqa 32(%rsi), %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 
16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm9 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = 
xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm8 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm10 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm11, %xmm11 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,0,1] +; AVX1-NEXT: vpaddd %xmm2, %xmm7, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm12, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm5, %xmm13, %xmm13 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm8 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm15 +; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm10 -; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm9 -; AVX1-NEXT: vpsubd %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpsubd %xmm7, %xmm11, %xmm11 -; AVX1-NEXT: vpsubd %xmm7, %xmm12, %xmm12 -; AVX1-NEXT: vpsubd %xmm7, %xmm13, %xmm5 -; AVX1-NEXT: vpsubd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm7, %xmm15, %xmm1 -; AVX1-NEXT: vpsubd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm7, 
%xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13 +; AVX1-NEXT: vmovdqa (%rsi), %xmm7 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm12, %xmm12 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm7, %xmm11, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm15, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm0, %xmm10, %xmm10 +; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm9 +; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm8 +; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm11 +; AVX1-NEXT: vpsubd %xmm0, %xmm14, %xmm14 +; AVX1-NEXT: vpsubd %xmm0, %xmm13, %xmm5 +; AVX1-NEXT: vpsubd %xmm0, %xmm12, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm7 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6 +; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $1, %xmm7, %xmm3 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4 +; AVX1-NEXT: vpsrld $1, %xmm14, %xmm4 ; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrld $1, %xmm11, %xmm4 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm5 @@ -381,67 +382,61 @@ ; ; AVX2-LABEL: avg_v48i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm0[3,1,2,3] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = 
xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpbroadcastq 24(%rdi), %ymm1 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,2,3] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm2, %ymm7, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,3,0,1] ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = mem[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq 24(%rsi), %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = mem[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = mem[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm7 +; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm0 +; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm0 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm7, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-NEXT: vpackusdw %xmm6, %xmm1, %xmm1 +; AVX2-NEXT: vpsrld $1, %ymm7, %ymm5 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-NEXT: vpackusdw %xmm7, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm4[0] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX2-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm4, %xmm2 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 @@ -454,16 +449,13 @@ ; ; AVX512F-LABEL: avg_v48i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512F-NEXT: vpavgb %xmm5, %xmm4, %xmm4 -; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb 32(%rsi), %xmm2, %xmm1 ; AVX512F-NEXT: vmovdqu %xmm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper @@ -471,37 +463,31 @@ ; ; AVX512BW-LABEL: avg_v48i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512BW-NEXT: vpaddd %zmm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero -; AVX512BW-NEXT: vpaddd %zmm4, %zmm3, %zmm3 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsubd %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsubd %zmm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrld $1, %zmm3, %zmm1 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpaddd %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpaddd %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 +; AVX512BW-NEXT: vpsubd %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsubd %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsubd %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrld $1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 -; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vpsrld $1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu %ymm1, (%rax) -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax) +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm1 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %1 = load <48 x i8>, <48 x i8>* %a @@ -535,20 +521,16 @@ ; ; AVX1-LABEL: avg_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vmovdqa (%rsi), %ymm2 -; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpavgb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpavgb 48(%rdi), %xmm3, %xmm1 +; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -662,13 +644,11 @@ ; ; AVX1-LABEL: avg_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa (%rsi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpavgw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 
16(%rsi), %xmm1 +; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -719,20 +699,16 @@ ; ; AVX1-LABEL: avg_v32i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vmovdqa (%rsi), %ymm2 -; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpavgw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpavgw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpavgw 48(%rdi), %xmm3, %xmm1 +; AVX1-NEXT: vpavgw 32(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -874,13 +850,11 @@ ; ; AVX1-LABEL: avg_v32i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa (%rsi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -931,16 +905,16 @@ ; ; AVX1-LABEL: avg_v64i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 ; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpavgb %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vpavgb %xmm3, %xmm3, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -1055,13 +1029,11 @@ ; ; AVX1-LABEL: avg_v16i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa (%rsi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: 
vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1112,20 +1084,16 @@ ; ; AVX1-LABEL: avg_v32i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vmovdqa (%rsi), %ymm2 -; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm1 +; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -1254,12 +1222,10 @@ ; ; AVX1-LABEL: avg_v32i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275] +; AVX1-NEXT: # xmm0 = mem[0,0] +; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -1308,20 +1274,16 @@ ; ; AVX1-LABEL: avg_v64i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpavgb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [7.9499288951273625E-275,7.9499288951273625E-275] +; AVX1-NEXT: # xmm0 = mem[0,0] +; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1420,11 +1382,9 @@ ; ; AVX1-LABEL: avg_v16i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; 
AVX1-NEXT: vzeroupper @@ -1473,19 +1433,15 @@ ; ; AVX1-LABEL: avg_v32i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpavgw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm2 +; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1727,131 +1683,97 @@ ; AVX1-NEXT: pushq %rbp ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp -; AVX1-NEXT: subq $128, %rsp +; AVX1-NEXT: subq $96, %rsp ; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: vmovdqa 144(%rbp), %ymm8 -; AVX1-NEXT: vmovdqa 112(%rbp), %ymm9 -; AVX1-NEXT: vmovdqa 80(%rbp), %ymm10 -; AVX1-NEXT: vmovdqa 48(%rbp), %ymm11 -; AVX1-NEXT: vmovdqa 16(%rbp), %ymm12 -; AVX1-NEXT: vmovdqa 272(%rbp), %ymm13 -; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm14 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm15 -; AVX1-NEXT: vpavgb %xmm14, %xmm15, %xmm14 -; AVX1-NEXT: vmovdqa 304(%rbp), %ymm15 -; AVX1-NEXT: vpavgb %xmm13, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm14 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vpavgb %xmm14, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa 336(%rbp), %ymm14 -; AVX1-NEXT: vpavgb %xmm15, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpavgb 272(%rbp), %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpavgb 288(%rbp), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 ; AVX1-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa 368(%rbp), %ymm1 -; AVX1-NEXT: vpavgb %xmm14, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vpavgb 304(%rbp), %xmm1, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpavgb 320(%rbp), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm0 ; AVX1-NEXT: vmovaps %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa 400(%rbp), %ymm2 -; AVX1-NEXT: vpavgb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm1 -; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa 432(%rbp), %ymm1 -; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 -; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa 464(%rbp), %ymm2 -; AVX1-NEXT: vpavgb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, 
%ymm1, %ymm5 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa 496(%rbp), %ymm1 -; AVX1-NEXT: vpavgb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm2 -; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa 528(%rbp), %ymm2 -; AVX1-NEXT: vpavgb %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm1 -; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa 560(%rbp), %ymm1 -; AVX1-NEXT: vpavgb %xmm2, %xmm12, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm12 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm2 -; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa 592(%rbp), %ymm2 -; AVX1-NEXT: vpavgb %xmm1, %xmm11, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm11 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm1 -; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa 624(%rbp), %ymm1 -; AVX1-NEXT: vpavgb %xmm2, %xmm10, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm10 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm2 -; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa 656(%rbp), %ymm2 -; AVX1-NEXT: vpavgb %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 -; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa 176(%rbp), %ymm1 -; AVX1-NEXT: vpavgb %xmm2, %xmm8, %xmm2 -; AVX1-NEXT: vmovdqa 688(%rbp), %ymm8 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm13 -; AVX1-NEXT: vpavgb %xmm2, %xmm13, %xmm2 -; AVX1-NEXT: vpavgb %xmm8, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa 208(%rbp), %ymm8 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm13 -; AVX1-NEXT: vmovdqa 720(%rbp), %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm15 -; AVX1-NEXT: vpavgb %xmm1, %xmm15, %xmm1 -; AVX1-NEXT: vpavgb %xmm2, %xmm8, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovdqa 240(%rbp), %ymm15 -; AVX1-NEXT: vmovdqa 752(%rbp), %ymm8 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm14 -; AVX1-NEXT: vpavgb %xmm2, %xmm14, %xmm2 -; AVX1-NEXT: vpavgb %xmm8, %xmm15, %xmm8 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 -; AVX1-NEXT: vmovaps %ymm2, 480(%rdi) -; AVX1-NEXT: vmovaps %ymm1, 448(%rdi) -; AVX1-NEXT: vmovaps %ymm13, 416(%rdi) -; AVX1-NEXT: vmovaps %ymm0, 384(%rdi) -; AVX1-NEXT: vmovaps %ymm9, 352(%rdi) +; AVX1-NEXT: vpavgb 336(%rbp), %xmm2, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpavgb 352(%rbp), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm13 +; AVX1-NEXT: vpavgb 368(%rbp), %xmm3, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vpavgb 384(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm14 +; AVX1-NEXT: vpavgb 400(%rbp), %xmm4, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vpavgb 416(%rbp), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm15 +; AVX1-NEXT: vpavgb 432(%rbp), %xmm5, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-NEXT: 
vpavgb 448(%rbp), %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm12 +; AVX1-NEXT: vpavgb 464(%rbp), %xmm6, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-NEXT: vpavgb 480(%rbp), %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-NEXT: vpavgb 496(%rbp), %xmm7, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-NEXT: vpavgb 512(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-NEXT: vmovdqa 16(%rbp), %xmm0 +; AVX1-NEXT: vmovdqa 32(%rbp), %xmm1 +; AVX1-NEXT: vpavgb 528(%rbp), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 544(%rbp), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 +; AVX1-NEXT: vmovdqa 48(%rbp), %xmm0 +; AVX1-NEXT: vmovdqa 64(%rbp), %xmm1 +; AVX1-NEXT: vpavgb 560(%rbp), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 576(%rbp), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX1-NEXT: vmovdqa 80(%rbp), %xmm0 +; AVX1-NEXT: vmovdqa 96(%rbp), %xmm1 +; AVX1-NEXT: vpavgb 592(%rbp), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 608(%rbp), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10 +; AVX1-NEXT: vmovdqa 112(%rbp), %xmm0 +; AVX1-NEXT: vmovdqa 128(%rbp), %xmm1 +; AVX1-NEXT: vpavgb 624(%rbp), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 640(%rbp), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa 144(%rbp), %xmm1 +; AVX1-NEXT: vmovdqa 160(%rbp), %xmm2 +; AVX1-NEXT: vpavgb 656(%rbp), %xmm1, %xmm1 +; AVX1-NEXT: vpavgb 672(%rbp), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa 176(%rbp), %xmm2 +; AVX1-NEXT: vmovdqa 192(%rbp), %xmm3 +; AVX1-NEXT: vpavgb 688(%rbp), %xmm2, %xmm2 +; AVX1-NEXT: vpavgb 704(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vmovdqa 208(%rbp), %xmm3 +; AVX1-NEXT: vmovdqa 224(%rbp), %xmm4 +; AVX1-NEXT: vpavgb 720(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vpavgb 736(%rbp), %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vmovdqa 240(%rbp), %xmm4 +; AVX1-NEXT: vpavgb 752(%rbp), %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa 256(%rbp), %xmm11 +; AVX1-NEXT: vpavgb 768(%rbp), %xmm11, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vmovaps %ymm4, 480(%rdi) +; AVX1-NEXT: vmovaps %ymm3, 448(%rdi) +; AVX1-NEXT: vmovaps %ymm2, 416(%rdi) +; AVX1-NEXT: vmovaps %ymm1, 384(%rdi) +; AVX1-NEXT: vmovaps %ymm0, 352(%rdi) ; AVX1-NEXT: vmovaps %ymm10, 320(%rdi) -; AVX1-NEXT: vmovaps %ymm11, 288(%rdi) -; AVX1-NEXT: vmovaps %ymm12, 256(%rdi) +; AVX1-NEXT: vmovaps %ymm9, 288(%rdi) +; AVX1-NEXT: vmovaps %ymm8, 256(%rdi) ; AVX1-NEXT: vmovaps %ymm7, 224(%rdi) ; AVX1-NEXT: vmovaps %ymm6, 192(%rdi) -; AVX1-NEXT: vmovaps %ymm5, 160(%rdi) -; AVX1-NEXT: vmovaps %ymm4, 128(%rdi) -; AVX1-NEXT: vmovaps %ymm3, 96(%rdi) +; AVX1-NEXT: vmovaps %ymm12, 160(%rdi) +; AVX1-NEXT: vmovaps %ymm15, 128(%rdi) +; AVX1-NEXT: vmovaps %ymm14, 96(%rdi) +; AVX1-NEXT: vmovaps %ymm13, 64(%rdi) ; AVX1-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %ymm0, (%rdi) Index: test/CodeGen/X86/avx-load-store.ll =================================================================== --- test/CodeGen/X86/avx-load-store.ll +++ test/CodeGen/X86/avx-load-store.ll @@ -255,8 +255,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups (%rsi), %xmm0 ; CHECK-NEXT: vmovups 16(%rsi), %xmm1 -; CHECK-NEXT: 
vmovups %xmm1, 16(%rdi) ; CHECK-NEXT: vmovups %xmm0, (%rdi) +; CHECK-NEXT: vmovups %xmm1, 16(%rdi) ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: add8i32: @@ -300,8 +300,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rsi), %xmm0 ; CHECK-NEXT: vmovaps 16(%rsi), %xmm1 -; CHECK-NEXT: vmovaps %xmm1, 16(%rdi) ; CHECK-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-NEXT: vmovaps %xmm1, 16(%rdi) ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: add4i64a16: Index: test/CodeGen/X86/avx-vzeroupper.ll =================================================================== --- test/CodeGen/X86/avx-vzeroupper.ll +++ test/CodeGen/X86/avx-vzeroupper.ll @@ -145,9 +145,7 @@ ; VZ-NEXT: # =>This Inner Loop Header: Depth=1 ; VZ-NEXT: callq do_sse ; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; VZ-NEXT: vextractf128 $1, %ymm0, %xmm0 -; VZ-NEXT: vzeroupper +; VZ-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 ; VZ-NEXT: callq do_sse ; VZ-NEXT: decl %ebx ; VZ-NEXT: jne .LBB3_3 @@ -176,8 +174,7 @@ ; FAST-ymm-zmm-NEXT: # =>This Inner Loop Header: Depth=1 ; FAST-ymm-zmm-NEXT: callq do_sse ; FAST-ymm-zmm-NEXT: callq do_sse -; FAST-ymm-zmm-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; FAST-ymm-zmm-NEXT: vextractf128 $1, %ymm0, %xmm0 +; FAST-ymm-zmm-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 ; FAST-ymm-zmm-NEXT: callq do_sse ; FAST-ymm-zmm-NEXT: decl %ebx ; FAST-ymm-zmm-NEXT: jne .LBB3_3 @@ -206,9 +203,7 @@ ; BDVER2-NEXT: # =>This Inner Loop Header: Depth=1 ; BDVER2-NEXT: callq do_sse ; BDVER2-NEXT: callq do_sse -; BDVER2-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; BDVER2-NEXT: vzeroupper +; BDVER2-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 ; BDVER2-NEXT: callq do_sse ; BDVER2-NEXT: decl %ebx ; BDVER2-NEXT: jne .LBB3_3 @@ -237,8 +232,7 @@ ; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 ; BTVER2-NEXT: callq do_sse ; BTVER2-NEXT: callq do_sse -; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; BTVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; BTVER2-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 ; BTVER2-NEXT: callq do_sse ; BTVER2-NEXT: decl %ebx ; BTVER2-NEXT: jne .LBB3_3 Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -222,20 +222,19 @@ define <4 x float> @slto4f32_mem(<4 x i64>* %a) { ; NODQ-LABEL: slto4f32_mem: ; NODQ: # %bb.0: -; NODQ-NEXT: vmovdqu (%rdi), %ymm0 +; NODQ-NEXT: vmovdqu (%rdi), %xmm0 +; NODQ-NEXT: vmovdqu 16(%rdi), %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; NODQ-NEXT: vzeroupper +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; NODQ-NEXT: retq ; ; VLDQ-LABEL: slto4f32_mem: @@ -253,20 +252,19 @@ ; ; KNL_WIDEN-LABEL: slto4f32_mem: ; KNL_WIDEN: # %bb.0: -; KNL_WIDEN-NEXT: vmovdqu (%rdi), %ymm0 +; 
KNL_WIDEN-NEXT: vmovdqu (%rdi), %xmm0
+; KNL_WIDEN-NEXT: vmovdqu 16(%rdi), %xmm1
 ; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL_WIDEN-NEXT: vmovq %xmm0, %rax
 ; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL_WIDEN-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; KNL_WIDEN-NEXT: vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL_WIDEN-NEXT: vpextrq $1, %xmm0, %rax
 ; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL_WIDEN-NEXT: vzeroupper
+; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; KNL_WIDEN-NEXT: vmovq %xmm1, %rax
+; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; KNL_WIDEN-NEXT: vpextrq $1, %xmm1, %rax
+; KNL_WIDEN-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
+; KNL_WIDEN-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; KNL_WIDEN-NEXT: retq
 %a1 = load <4 x i64>, <4 x i64>* %a, align 8
 %b = sitofp <4 x i64> %a1 to <4 x float>
Index: test/CodeGen/X86/avx512-extract-subvector-load-store.ll
===================================================================
--- test/CodeGen/X86/avx512-extract-subvector-load-store.ll
+++ test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -5,8 +5,7 @@
 define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $4, %k0, %k0
+; AVX512-NEXT: kmovb 4(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -16,9 +15,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 4(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
@@ -35,8 +33,7 @@
 define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $6, %k0, %k0
+; AVX512-NEXT: kmovb 6(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -46,9 +43,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 6(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -65,8 +61,7 @@
 define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovw (%rdi), %k0
-; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: kmovb 8(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -76,8 +71,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 8(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
@@ -94,8 +89,7 @@
 define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovw (%rdi), %k0
-; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: kmovb 8(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
@@ -105,8 +99,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 8(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
@@ -123,8 +117,7 @@
 define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovw (%rdi), %k0
-; AVX512-NEXT: kshiftrw $14, %k0, %k0
+; AVX512-NEXT: kmovb 14(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -134,8 +127,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 14(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -152,8 +145,7 @@
 define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovw (%rdi), %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: kmovb 12(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
@@ -163,8 +155,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 12(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
@@ -181,8 +173,7 @@
 define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: kmovb 16(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -192,8 +183,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
@@ -210,8 +201,7 @@
 define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: kmovb 16(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
@@ -221,8 +211,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
@@ -239,8 +229,7 @@
 define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: kmovb 16(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
@@ -251,8 +240,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
@@ -270,8 +259,7 @@
 define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $30, %k0, %k0
+; AVX512-NEXT: kmovb 30(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -281,8 +269,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 30(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -299,8 +287,7 @@
 define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $28, %k0, %k0
+; AVX512-NEXT: kmovb 28(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
@@ -310,8 +297,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 28(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
@@ -328,8 +315,7 @@
 define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $24, %k0, %k0
+; AVX512-NEXT: kmovb 24(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
 ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
@@ -341,8 +327,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 24(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
@@ -361,8 +347,7 @@
 define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kmovb 32(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -372,8 +357,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
@@ -390,8 +375,7 @@
 define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kmovb 32(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
@@ -401,8 +385,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
@@ -419,8 +403,7 @@
 define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kmovb 32(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
@@ -431,8 +414,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
@@ -450,8 +433,7 @@
 define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kmovw 32(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %zmm2
 ; AVX512-NEXT: vpbroadcastd %xmm2, %zmm2
 ; AVX512-NEXT: vpmovd2m %zmm2, %k1
@@ -462,8 +444,7 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -480,8 +461,7 @@
 define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $62, %k0, %k0
+; AVX512-NEXT: kmovb 62(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
@@ -491,8 +471,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 62(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -509,8 +489,7 @@
 define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $60, %k0, %k0
+; AVX512-NEXT: kmovb 60(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
@@ -520,8 +499,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 60(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
@@ -538,8 +517,7 @@
 define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $56, %k0, %k0
+; AVX512-NEXT: kmovb 56(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
 ; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
@@ -551,8 +529,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 56(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
@@ -571,8 +549,7 @@
 define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $48, %k0, %k0
+; AVX512-NEXT: kmovw 48(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %zmm2
 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2
@@ -584,8 +561,7 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $48, %k0, %k1
+; AVX512NOTDQ-NEXT: kmovw 48(%rdi), %k1
 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2
@@ -603,17 +579,13 @@
 define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $1, %k0, %k0
+; AVX512-NEXT: kmovb 1(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 1(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <2 x i1>, <2 x i1>* %a0
@@ -624,17 +596,13 @@
 define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $1, %k0, %k0
+; AVX512-NEXT: kmovb 1(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 1(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <3 x i1>, <3 x i1>* %a0
@@ -645,17 +613,13 @@
 define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $2, %k0, %k0
+; AVX512-NEXT: kmovb 2(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 2(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <3 x i1>, <3 x i1>* %a0
@@ -666,17 +630,13 @@
 define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $2, %k0, %k0
+; AVX512-NEXT: kmovb 2(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 2(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <4 x i1>, <4 x i1>* %a0
@@ -687,17 +647,13 @@
 define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $3, %k0, %k0
+; AVX512-NEXT: kmovb 3(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $3, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 3(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <4 x i1>, <4 x i1>* %a0
@@ -708,17 +664,13 @@
 define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $4, %k0, %k0
+; AVX512-NEXT: kmovb 4(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 4(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <8 x i1>, <8 x i1>* %a0
@@ -729,8 +681,7 @@
 define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $4, %k0, %k0
+; AVX512-NEXT: kmovb 4(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -739,9 +690,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 4(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
@@ -757,17 +707,13 @@
 define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $7, %k0, %k0
+; AVX512-NEXT: kmovb 7(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 7(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <8 x i1>, <8 x i1>* %a0
@@ -778,8 +724,7 @@
 define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovb (%rdi), %k0
-; AVX512-NEXT: kshiftrb $6, %k0, %k0
+; AVX512-NEXT: kmovb 6(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -788,9 +733,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
-; AVX512NOTDQ-NEXT: kmovd %eax, %k0
-; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 6(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
@@ -806,16 +750,13 @@
 define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovw (%rdi), %k0
-; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: kmovb 8(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 8(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <16 x i1>, <16 x i1>* %a0
@@ -826,8 +767,7 @@
 define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovw (%rdi), %k0
-; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: kmovb 8(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -836,8 +776,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 8(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
@@ -853,8 +793,7 @@
 define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovw (%rdi), %k0
-; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: kmovb 8(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
@@ -863,8 +802,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 8(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
@@ -880,16 +819,13 @@
 define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovw (%rdi), %k0
-; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kmovb 15(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 15(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <16 x i1>, <16 x i1>* %a0
@@ -900,8 +836,7 @@
 define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovw (%rdi), %k0
-; AVX512-NEXT: kshiftrw $14, %k0, %k0
+; AVX512-NEXT: kmovb 14(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -910,8 +845,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 14(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
@@ -927,8 +862,7 @@
 define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovw (%rdi), %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: kmovb 12(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
@@ -937,8 +871,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 12(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
@@ -954,16 +888,13 @@
 define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: kmovb 16(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 16(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <32 x i1>, <32 x i1>* %a0
@@ -974,8 +905,7 @@
 define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: kmovb 16(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -984,8 +914,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
@@ -1001,8 +931,7 @@
 define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: kmovb 16(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
@@ -1011,8 +940,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
@@ -1028,8 +957,7 @@
 define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: kmovb 16(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
@@ -1039,8 +967,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 16(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
@@ -1057,16 +985,13 @@
 define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $31, %k0, %k0
+; AVX512-NEXT: kmovb 31(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $31, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 31(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <32 x i1>, <32 x i1>* %a0
@@ -1077,8 +1002,7 @@
 define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $30, %k0, %k0
+; AVX512-NEXT: kmovb 30(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -1087,8 +1011,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 30(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
@@ -1104,8 +1028,7 @@
 define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $28, %k0, %k0
+; AVX512-NEXT: kmovb 28(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
@@ -1114,8 +1037,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 28(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
@@ -1131,8 +1054,7 @@
 define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovd (%rdi), %k0
-; AVX512-NEXT: kshiftrd $24, %k0, %k0
+; AVX512-NEXT: kmovb 24(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
 ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
@@ -1143,8 +1065,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 24(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
@@ -1162,16 +1084,13 @@
 define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kmovb 32(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 32(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
@@ -1182,8 +1101,7 @@
 define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kmovb 32(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -1192,8 +1110,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
@@ -1209,8 +1127,7 @@
 define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kmovb 32(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
@@ -1219,8 +1136,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
@@ -1236,8 +1153,7 @@
 define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kmovb 32(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
@@ -1247,8 +1163,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 32(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
@@ -1265,8 +1181,7 @@
 define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kmovw 32(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %zmm0
 ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
 ; AVX512-NEXT: vpmovd2m %zmm0, %k0
@@ -1276,8 +1191,7 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: kmovw 32(%rdi), %k1
 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1292,16 +1206,13 @@
 define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $63, %k0, %k0
+; AVX512-NEXT: kmovb 63(%rdi), %k0
 ; AVX512-NEXT: kmovb %k0, (%rsi)
 ; AVX512-NEXT: retq
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $63, %k0, %k0
-; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb 63(%rdi), %al
 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
 ; AVX512NOTDQ-NEXT: retq
 %d0 = load <64 x i1>, <64 x i1>* %a0
@@ -1312,8 +1223,7 @@
 define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $62, %k0, %k0
+; AVX512-NEXT: kmovb 62(%rdi), %k0
 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
@@ -1322,8 +1232,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 62(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
@@ -1339,8 +1249,7 @@
 define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $60, %k0, %k0
+; AVX512-NEXT: kmovb 60(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
@@ -1349,8 +1258,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 60(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
@@ -1366,8 +1275,7 @@
 define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $56, %k0, %k0
+; AVX512-NEXT: kmovb 56(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
 ; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
 ; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
@@ -1378,8 +1286,8 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
+; AVX512NOTDQ-NEXT: movzbl 56(%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k1
 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
@@ -1397,8 +1305,7 @@
 define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
 ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: kmovq (%rdi), %k0
-; AVX512-NEXT: kshiftrq $48, %k0, %k0
+; AVX512-NEXT: kmovw 48(%rdi), %k0
 ; AVX512-NEXT: vpmovm2d %k0, %zmm0
 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0
@@ -1409,8 +1316,7 @@
 ;
 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store:
 ; AVX512NOTDQ: # %bb.0:
-; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
-; AVX512NOTDQ-NEXT: kshiftrq $48, %k0, %k1
+; AVX512NOTDQ-NEXT: kmovw 48(%rdi), %k1
 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
Index: test/CodeGen/X86/avx512-insert-extract.ll
===================================================================
--- test/CodeGen/X86/avx512-insert-extract.ll
+++ test/CodeGen/X86/avx512-insert-extract.ll
@@ -1776,25 +1776,21 @@
 ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT: vmovdqa %ymm0, (%rsp)
 ; KNL-NEXT: setne (%rsp,%rsi)
-; KNL-NEXT: vmovdqa (%rsp), %ymm0
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %ecx
 ; KNL-NEXT: shll $16, %ecx
 ; KNL-NEXT: orl %eax, %ecx
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
@@ -1961,51 +1957,43 @@
 ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT: vmovdqa %ymm2, (%rsp)
 ; KNL-NEXT: setne (%rsp,%rax)
-; KNL-NEXT: vmovdqa (%rsp), %ymm2
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm4
-; KNL-NEXT: vpslld $31, %zmm4, %zmm4
-; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %ecx
 ; KNL-NEXT: shll $16, %ecx
 ; KNL-NEXT: orl %eax, %ecx
-; KNL-NEXT: vpmovsxbd %xmm3, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm2
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: shll $16, %eax
 ; KNL-NEXT: orl %edx, %eax
 ; KNL-NEXT: shlq $32, %rax
 ; KNL-NEXT: orq %rcx, %rax
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %esi
 ; KNL-NEXT: shll $16, %esi
 ; KNL-NEXT: orl %ecx, %esi
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %edx
@@ -2181,51 +2169,43 @@
 ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT: vmovdqa %ymm0, (%rsp)
 ; KNL-NEXT: setne (%rsp,%rsi)
-; KNL-NEXT: vmovdqa (%rsp), %ymm2
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
-; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm4
-; KNL-NEXT: vpslld $31, %zmm4, %zmm4
-; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %ecx
 ; KNL-NEXT: shll $16, %ecx
 ; KNL-NEXT: orl %eax, %ecx
-; KNL-NEXT: vpmovsxbd %xmm3, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm2
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: shll $16, %eax
 ; KNL-NEXT: orl %edx, %eax
 ; KNL-NEXT: shlq $32, %rax
 ; KNL-NEXT: orq %rcx, %rax
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %esi
 ; KNL-NEXT: shll $16, %esi
 ; KNL-NEXT: orl %ecx, %esi
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT: kmovw %k0, %edx
Index: test/CodeGen/X86/avx512-shuffles/partial_permute.ll
===================================================================
--- test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -162,12 +162,11 @@
 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -176,14 +175,13 @@
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
 ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -195,14 +193,13 @@
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vmovdqa (%rdi), %xmm1
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
 ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -214,14 +211,13 @@
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
 ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -233,14 +229,13 @@
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vmovdqa (%rdi), %xmm1
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
 ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -252,14 +247,12 @@
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3
-; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -271,14 +264,12 @@
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vpsrld $16, %xmm1, %xmm2
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
+; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm1
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -290,11 +281,9 @@
 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -303,13 +292,11 @@
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
-; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = mem[0],xmm2[1,2,3]
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -321,13 +308,11 @@
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
-; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; CHECK-NEXT: vmovdqa (%rdi), %xmm1
+; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1,2,3]
 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -625,10 +610,9 @@
 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -637,12 +621,11 @@
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
-; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -654,11 +637,10 @@
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
@@ -671,12 +653,11 @@
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
-; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -688,11 +669,10 @@
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
@@ -705,12 +685,11 @@
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
-; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
+; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -722,11 +701,10 @@
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
@@ -739,10 +717,9 @@
 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -751,12 +728,11 @@
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
-; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32>
@@ -768,11 +744,10 @@
 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
@@ -785,10 +760,9 @@
 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -799,12 +773,11 @@
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
@@ -817,12 +790,11 @@
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm2
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
@@ -835,12 +807,11 @@
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
@@ -853,12 +824,11 @@
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm2
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
@@ -871,12 +841,11 @@
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
-; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <32 x i16>, <32 x i16>* %vp
@@ -889,12 +858,11 @@
 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: 
test_masked_z_32xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -907,10 +875,9 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) { ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -921,12 +888,11 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1} +; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -939,12 +905,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u> -; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u> +; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %vp @@ -1102,10 +1067,8 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,1],xmm0[0,0] -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vmovaps 16(%rdi), %xmm0 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],mem[0,0] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1114,12 +1077,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: 
-; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3 -; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0] +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 +; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],mem[0,0] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1131,12 +1092,10 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0] +; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],mem[0,0] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1148,12 +1107,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 -; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 +; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1165,12 +1122,10 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1182,12 +1137,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 -; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 +; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1199,12 +1152,10 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 -; CHECK-NEXT: 
vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1216,12 +1167,9 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,2,3] +; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1230,14 +1178,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = mem[1,1,2,3] +; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm3 +; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1249,14 +1194,11 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = mem[1,1,2,3] +; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 +; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1592,12 +1534,11 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,6,11,0,1,5,15] -; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15] +; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, 
<16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1609,11 +1550,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1626,12 +1566,11 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,14,1,5,4,2,8,10] -; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10] +; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1643,11 +1582,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1660,10 +1598,9 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12] -; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1672,12 +1609,11 @@ define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,4,1,13,15,4,6,12] -; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12] +; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> @@ -1689,11 +1625,10 @@ define <8 x i32> 
@test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12] ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1706,10 +1641,9 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <13,0,0,6,u,u,u,u> -; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1720,12 +1654,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <13,0,0,6,u,u,u,u> -; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u> +; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1738,12 +1671,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u> -; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <13,0,0,6,u,u,u,u> +; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1756,12 +1688,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [15,5,3,2,15,5,7,6] -; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6] +; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1774,12 +1705,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: 
test_masked_z_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6] -; CHECK-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [15,5,3,2,15,5,7,6] +; CHECK-NEXT: vpermi2d (%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1792,12 +1722,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <2,15,6,9,u,u,u,u> -; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u> +; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1810,12 +1739,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u> -; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <2,15,6,9,u,u,u,u> +; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1828,16 +1756,15 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 ; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vpextrd $3, %xmm1, %eax ; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm1 ; CHECK-NEXT: vpextrd $2, %xmm0, %eax ; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1846,9 +1773,9 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 +; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3 ; CHECK-NEXT: vmovd %xmm2, %eax -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 ; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] ; CHECK-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 ; CHECK-NEXT: vpextrd $3, %xmm3, %eax @@ -1857,7 +1784,6 @@ ; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; 
CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1869,9 +1795,9 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 +; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 ; CHECK-NEXT: vmovd %xmm1, %eax -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] ; CHECK-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 ; CHECK-NEXT: vpextrd $3, %xmm2, %eax @@ -1880,7 +1806,6 @@ ; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1959,10 +1884,8 @@ define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) { ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> @@ -1971,11 +1894,9 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[1] -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} = xmm2[1],mem[1] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> @@ -1987,11 +1908,9 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[1] -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[1] ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> @@ -2003,12 +1922,10 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 -; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 +; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> @@ 
-2020,12 +1937,10 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> @@ -2402,12 +2317,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,4] -; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,4] +; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2419,11 +2333,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2436,12 +2349,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,5,5,1] -; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,5,5,1] +; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2453,11 +2365,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2470,10 +2381,9 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) { ; 
CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2] -; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2482,12 +2392,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,0,0,2] -; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,0,0,2] +; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2499,11 +2408,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2516,12 +2424,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,6,1] -; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,6,1] +; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2533,11 +2440,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2550,12 +2456,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa 
{{.*#+}} ymm4 = [0,2,7,1] -; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,7,1] +; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2567,11 +2472,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2584,10 +2488,9 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovdqa (%rdi), %ymm1 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2] -; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2596,12 +2499,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,2,3,2] -; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,2,3,2] +; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2613,11 +2515,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa (%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2630,12 +2531,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,1,5] -; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,1,5] +; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmq 
%ymm1, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2647,11 +2547,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5] ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2664,11 +2563,9 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 -; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovaps 32(%rdi), %xmm1 ; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> @@ -2677,12 +2574,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti32x4 $2, %zmm2, %xmm3 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: vmovdqa 32(%rdi), %xmm3 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> @@ -2694,12 +2589,10 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> @@ -2711,9 +2604,8 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} @@ -2729,9 +2621,8 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) { ; CHECK-LABEL: 
test_masked_z_8xi64_to_2xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} @@ -2894,11 +2785,10 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0] ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,1] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2907,13 +2797,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3 +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,0] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[2,0],xmm3[0,1] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2925,13 +2814,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,0],xmm2[0,1] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2943,13 +2831,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3 -; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 +; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2961,13 +2847,11 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2979,13 +2863,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3 +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 ; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,0] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[3,1],xmm3[2,0] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -2997,13 +2880,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[3,1],xmm2[2,0] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3015,11 +2897,10 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) { ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0] ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3028,13 +2909,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm2 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3 +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 ; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,0] ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[1,3],xmm3[0,2] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3046,13 +2926,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3: 
; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %ymm1 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[3,0] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3],xmm2[0,2] -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> @@ -3372,10 +3251,9 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovaps (%rdi), %ymm1 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] -; CHECK-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3384,13 +3262,12 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 -; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [7,6,7,11,5,10,0,4] -; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vmovaps (%rdi), %ymm2 +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4] +; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3402,12 +3279,11 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 -; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovaps (%rdi), %ymm2 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] -; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1 -; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 +; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -3420,13 +3296,12 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 -; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [11,0,9,0,7,14,0,8] -; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vmovaps (%rdi), %ymm2 +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8] +; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp %shuf 
= shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3438,12 +3313,11 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 -; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovaps (%rdi), %ymm2 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] -; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1 -; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 +; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -3456,13 +3330,12 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 -; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1] -; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] +; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1} +; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3474,12 +3347,11 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 -; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] -; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1 -; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 +; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -3492,10 +3364,9 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) { ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovaps 32(%rdi), %ymm1 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9] -; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3504,13 +3375,12 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm2 -; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [7,5,3,3,11,4,12,9] -; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = 
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
+; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32>
@@ -3522,12 +3392,11 @@
 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1
-; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
+; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
@@ -3540,12 +3409,9 @@
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm0
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,3,3]
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,3,3]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = mem[3,1,2,3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
@@ -3555,12 +3421,9 @@
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm2
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
-; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,2,3,3]
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2
-; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,1,2,3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = mem[0,2,3,3]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm3 = mem[3,1,2,3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
@@ -3576,12 +3439,9 @@
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,3]
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,2,3,3]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = mem[3,1,2,3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
@@ -3597,13 +3457,12 @@
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,10,6,15,4,14,6,15]
-; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15]
+; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
@@ -3616,13 +3475,12 @@
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15]
-; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,10,6,15,4,14,6,15]
+; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm2
 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
@@ -3635,13 +3493,12 @@
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [4,14,4,14,4,14,6,7]
-; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7]
+; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
@@ -3654,13 +3511,12 @@
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7]
-; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,14,4,14,4,14,6,7]
+; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm2
 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
@@ -3673,10 +3529,9 @@
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -3687,13 +3542,12 @@
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovaps (%rdi), %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u>
+; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
@@ -3706,13 +3560,12 @@
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT: vpermi2ps %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <3,3,15,9,u,u,u,u>
+; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm2
 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, <16 x float>* %vp
@@ -3795,10 +3648,8 @@
 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) {
 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vmovaps (%rdi), %xmm0
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; CHECK-NEXT: retq
 %vec = load <4 x double>, <4 x double>* %vp
 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32>
@@ -3807,13 +3658,11 @@
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
-; CHECK-NEXT: vblendpd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; CHECK-NEXT: vmovapd (%rdi), %xmm2
+; CHECK-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <4 x double>, <4 x double>* %vp
 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32>
@@ -3825,13 +3674,11 @@
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-NEXT: vmovapd (%rdi), %xmm1
+; CHECK-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1]
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
 ; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <4 x double>, <4 x double>* %vp
 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32>
@@ -3843,12 +3690,10 @@
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vmovapd 16(%rdi), %xmm2
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],mem[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x double>, <4 x double>* %vp
 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32>
@@ -3860,12 +3705,10 @@
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vmovapd 16(%rdi), %xmm1
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],mem[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x double>, <4 x double>* %vp
 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32>
@@ -4231,10 +4074,9 @@
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovapd (%rdi), %ymm1
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,7,2]
-; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4243,13 +4085,12 @@
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,6,7,2]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,7,2]
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4261,12 +4102,11 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
@@ -4279,13 +4119,12 @@
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,4,2,4]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,4]
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4297,12 +4136,11 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,4]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
@@ -4315,13 +4153,12 @@
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,2,3,4]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,3,4]
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4333,12 +4170,11 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
@@ -4351,10 +4187,9 @@
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovapd (%rdi), %ymm1
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [4,2,1,0]
-; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4363,13 +4198,12 @@
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [4,2,1,0]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [4,2,1,0]
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4381,12 +4215,11 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
@@ -4399,13 +4232,12 @@
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,4,1,5]
-; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,4,1,5]
+; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4417,12 +4249,11 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [2,4,1,5]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
@@ -4435,13 +4266,12 @@
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [6,1,1,1]
-; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [6,1,1,1]
+; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4453,12 +4283,11 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [6,1,1,1]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
@@ -4471,9 +4300,8 @@
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1
+; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,6,1]
 ; CHECK-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
 ; CHECK-NEXT: retq
@@ -4484,11 +4312,10 @@
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm2
+; CHECK-NEXT: vmovapd 32(%rdi), %ymm3
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,6,1]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm4
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
 ; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
@@ -4503,9 +4330,8 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; CHECK-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm2
+; CHECK-NEXT: vmovapd 32(%rdi), %ymm3
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,6,1]
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
@@ -4522,13 +4348,12 @@
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,5,2,5]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,5,2,5]
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4540,12 +4365,11 @@
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,5,2,5]
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
@@ -4558,10 +4382,9 @@
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovapd (%rdi), %ymm1
 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,3,6]
-; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
@@ -4572,13 +4395,12 @@
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,6,3,6]
-; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,3,6]
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovapd %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
@@ -4591,13 +4413,12 @@
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,3,6]
-; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vmovapd (%rdi), %ymm1
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,6,3,6]
+; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm2
 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
@@ -4610,12 +4431,10 @@
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm2
-; CHECK-NEXT: vextractf32x4 $2, %zmm2, %xmm3
-; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1
-; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[0]
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vmovapd (%rdi), %xmm2
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],mem[0]
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
@@ -4627,12 +4446,10 @@
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm2
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1
-; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[0]
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vmovapd (%rdi), %xmm1
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[0]
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, <8 x double>* %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
Index: test/CodeGen/X86/fma.ll
===================================================================
--- test/CodeGen/X86/fma.ll
+++ test/CodeGen/X86/fma.ll
@@ -1071,9 +1071,7 @@
 ; FMACALL32_BDVER2-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
 ; FMACALL32_BDVER2-NEXT: subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: ## imm = 0x1C0
-; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm4 ## encoding: [0xc5,0xf8,0x28,0x65,0x38]
 ; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
@@ -1082,46 +1080,43 @@
 ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01]
 ; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
 ; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0xb0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
-; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm4, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x64,0x24,0x08,0x02]
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xb0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
 ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
 ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
@@ -1134,7 +1129,7 @@
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
@@ -1147,7 +1142,7 @@
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
@@ -1160,7 +1155,7 @@
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
@@ -1173,36 +1168,32 @@
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
 ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
 ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x60]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
 ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
 ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
@@ -1210,31 +1201,29 @@
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x60]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
 ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
 ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
@@ -1247,7 +1236,7 @@
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
@@ -1260,9 +1249,9 @@
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
@@ -1273,9 +1262,9 @@
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xd0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
@@ -1286,93 +1275,92 @@
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xb0,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
 ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03]
 ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xd0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x50]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x5c]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x58]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x54]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x50]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48]
 ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44]
 ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
-; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c]
 ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero
-; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10]
+; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10]
 ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c]
 ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero
-; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10]
+; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10]
 ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x4c]
 ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero
-; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x38,0x10]
+; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x48,0x10]
 ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20]
+; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20]
 ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20]
+; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20]
 ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x20]
+; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x44,0x20]
 ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30]
+; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30]
 ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
-; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30]
+; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30]
 ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0]
-; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x30]
+; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x40,0x30]
 ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1,2],mem[0]
 ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
-; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x4c]
+; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c]
 ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero
-; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x10]
+; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x58,0x10]
 ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x20]
+; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x54,0x20]
 ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x40,0x30]
+; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x50,0x30]
 ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0]
 ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
 ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec]
@@ -1844,11 +1832,11 @@
 ; FMACALL32_BDVER2-NEXT: pushl %ebp ## encoding: [0x55]
 ; FMACALL32_BDVER2-NEXT: movl %esp, %ebp ## encoding: [0x89,0xe5]
 ; FMACALL32_BDVER2-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
-; FMACALL32_BDVER2-NEXT: subl $384, %esp ## encoding: [0x81,0xec,0x80,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: ## imm = 0x180
+; FMACALL32_BDVER2-NEXT: subl $352, %esp ## encoding: [0x81,0xec,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: ## imm = 0x160
 ; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
[0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] @@ -1857,21 +1845,18 @@ ; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x40,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x50,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x40,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm2[0] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovapd 40(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovapd 40(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] @@ -1884,7 +1869,7 @@ ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28] +; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] @@ -1897,12 +1882,9 @@ ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; 
FMACALL32_BDVER2-NEXT: vmovapd 24(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x18] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32_BDVER2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x29,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] @@ -1910,30 +1892,29 @@ ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0x30,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],xmm1[1] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x30,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x14,0x44,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovapd 8(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: vmovapd 8(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30] ; 
FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] @@ -1946,7 +1927,7 @@ ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08] +; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10] @@ -1959,13 +1940,12 @@ ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovapd 56(%ebp), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x45,0x38] ; FMACALL32_BDVER2-NEXT: vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x40,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x58,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x48,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] Index: test/CodeGen/X86/insert-into-constant-vector.ll =================================================================== --- test/CodeGen/X86/insert-into-constant-vector.ll +++ test/CodeGen/X86/insert-into-constant-vector.ll @@ -439,9 +439,9 @@ ; ; X64AVX1-LABEL: elt5_v8i64: ; X64AVX1: # %bb.0: -; X64AVX1-NEXT: vmovdqa {{.*#+}} ymm0 = <4,u,6,7> -; X64AVX1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 -; X64AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; X64AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <4,u,6,7> +; X64AVX1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 +; X64AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; X64AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,1,2,3] ; X64AVX1-NEXT: retq ; @@ -457,9 +457,9 @@ ; ; X64AVX2-LABEL: elt5_v8i64: ; X64AVX2: # %bb.0: -; X64AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <4,u,6,7> -; X64AVX2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 -; X64AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; X64AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <4,u,6,7> +; X64AVX2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 +; X64AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; X64AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,1,2,3] ; X64AVX2-NEXT: retq ; @@ -507,47 +507,49 @@ ; ; X32AVX1-LABEL: elt1_v8f64: ; X32AVX1: # %bb.0: -; X32AVX1-NEXT: vmovapd {{.*#+}} ymm0 = <4.2E+1,u,2.0E+0,3.0E+0> -; X32AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; X32AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; 
X32AVX1-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X32AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X32AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] ; X32AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X32AVX1-NEXT: retl ; ; X64AVX1-LABEL: elt1_v8f64: ; X64AVX1: # %bb.0: -; X64AVX1-NEXT: vmovaps {{.*#+}} ymm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> ; X64AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X64AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; X64AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X64AVX1-NEXT: retq ; ; X32AVX2-LABEL: elt1_v8f64: ; X32AVX2: # %bb.0: -; X32AVX2-NEXT: vmovapd {{.*#+}} ymm0 = <4.2E+1,u,2.0E+0,3.0E+0> -; X32AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; X32AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; X32AVX2-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X32AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X32AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] ; X32AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X32AVX2-NEXT: retl ; ; X64AVX2-LABEL: elt1_v8f64: ; X64AVX2: # %bb.0: -; X64AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> ; X64AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X64AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; X64AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X64AVX2-NEXT: retq ; ; X32AVX512F-LABEL: elt1_v8f64: ; X32AVX512F: # %bb.0: -; X32AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> -; X32AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; X32AVX512F-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; X32AVX512F-NEXT: vmovapd {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X32AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X32AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X32AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0 ; X32AVX512F-NEXT: retl ; ; X64AVX512F-LABEL: elt1_v8f64: ; X64AVX512F: # %bb.0: -; X64AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X64AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> ; X64AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> ; X64AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0 ; X64AVX512F-NEXT: retq %ins = insertelement <8 x double> , double %x, i32 1 Index: test/CodeGen/X86/madd.ll =================================================================== --- test/CodeGen/X86/madd.ll +++ test/CodeGen/X86/madd.ll @@ -234,16 +234,14 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB2_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rdi,%rcx,2), %ymm2 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2 +; 
AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB2_1 @@ -398,46 +396,42 @@ ; AVX1-LABEL: _Z10test_shortPsS_i_1024: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: xorl %ecx, %ecx ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB3_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rdi,%rcx,2), %ymm3 -; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,2), %ymm4 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %ymm5 -; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm6 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0 -; AVX1-NEXT: vpmaddwd %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm3 +; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4 +; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5 +; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6 +; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpmaddwd %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4 -; AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5 +; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB3_1 ; AVX1-NEXT: # %bb.2: # %middle.block -; AVX1-NEXT: vpaddd %xmm8, %xmm2, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm4 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm5 -; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] @@ -2355,13 +2349,11 @@ ; ; AVX1-LABEL: pmaddwd_256: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa (%rsi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm3 -; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX256-LABEL: pmaddwd_256: @@ -2400,20 +2392,16 @@ ; ; AVX1-LABEL: pmaddwd_512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vmovdqa (%rsi), %ymm2 -; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1 +; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: pmaddwd_512: @@ -2458,61 +2446,53 @@ ; SSE2-LABEL: pmaddwd_1024: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movdqa 112(%rsi), %xmm0 -; SSE2-NEXT: movdqa 96(%rsi), %xmm1 -; SSE2-NEXT: movdqa 80(%rsi), %xmm2 -; SSE2-NEXT: movdqa 64(%rsi), %xmm3 -; SSE2-NEXT: movdqa (%rsi), %xmm4 -; SSE2-NEXT: movdqa 16(%rsi), %xmm5 -; SSE2-NEXT: movdqa 32(%rsi), %xmm6 -; SSE2-NEXT: movdqa 48(%rsi), %xmm7 -; SSE2-NEXT: pmaddwd (%rdx), %xmm4 -; SSE2-NEXT: pmaddwd 16(%rdx), %xmm5 -; SSE2-NEXT: pmaddwd 32(%rdx), %xmm6 -; SSE2-NEXT: pmaddwd 48(%rdx), %xmm7 -; SSE2-NEXT: pmaddwd 64(%rdx), %xmm3 -; SSE2-NEXT: pmaddwd 80(%rdx), %xmm2 -; SSE2-NEXT: pmaddwd 96(%rdx), %xmm1 -; SSE2-NEXT: pmaddwd 112(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 112(%rdi) -; SSE2-NEXT: movdqa %xmm1, 96(%rdi) -; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm3, 64(%rdi) -; SSE2-NEXT: movdqa %xmm7, 48(%rdi) -; SSE2-NEXT: movdqa %xmm6, 32(%rdi) -; SSE2-NEXT: movdqa %xmm5, 16(%rdi) -; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: pmaddwd (%rdx), %xmm0 +; SSE2-NEXT: pmaddwd 16(%rdx), %xmm1 +; SSE2-NEXT: pmaddwd 32(%rdx), %xmm2 +; SSE2-NEXT: pmaddwd 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 64(%rsi), %xmm4 +; SSE2-NEXT: pmaddwd 64(%rdx), %xmm4 +; SSE2-NEXT: movdqa 80(%rsi), %xmm5 +; SSE2-NEXT: pmaddwd 80(%rdx), %xmm5 +; SSE2-NEXT: movdqa 96(%rsi), %xmm6 +; SSE2-NEXT: pmaddwd 96(%rdx), %xmm6 +; SSE2-NEXT: movdqa 112(%rsi), %xmm7 +; SSE2-NEXT: pmaddwd 112(%rdx), %xmm7 +; SSE2-NEXT: movdqa %xmm7, 112(%rdi) +; SSE2-NEXT: movdqa %xmm6, 96(%rdi) +; SSE2-NEXT: movdqa %xmm5, 80(%rdi) +; SSE2-NEXT: movdqa %xmm4, 64(%rdi) +; SSE2-NEXT: movdqa %xmm3, 48(%rdi) +; SSE2-NEXT: movdqa %xmm2, 32(%rdi) +; SSE2-NEXT: movdqa %xmm1, 16(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_1024: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: 
vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX1-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX1-NEXT: vmovdqa (%rsi), %ymm4 -; AVX1-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX1-NEXT: vmovdqa 64(%rsi), %ymm6 -; AVX1-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm4 -; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmaddwd %xmm9, %xmm8, %xmm4 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1 +; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-NEXT: vpmaddwd 80(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX1-NEXT: vpmaddwd 64(%rsi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-NEXT: vpmaddwd 112(%rsi), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX1-NEXT: vpmaddwd 96(%rsi), %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: retq ; Index: test/CodeGen/X86/nontemporal-loads.ll =================================================================== --- test/CodeGen/X86/nontemporal-loads.ll +++ test/CodeGen/X86/nontemporal-loads.ll @@ -852,12 +852,12 @@ ; ; AVX1-LABEL: test_arg_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v8i32: @@ -932,12 +932,12 @@ ; ; AVX1-LABEL: test_arg_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v4i64: @@ -973,12 +973,12 @@ ; ; AVX1-LABEL: test_arg_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 -; 
AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v16i16: @@ -1014,12 +1014,12 @@ ; ; AVX1-LABEL: test_arg_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v32i8: @@ -1114,18 +1114,18 @@ ; ; AVX1-LABEL: test_arg_v16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v16i32: @@ -1220,18 +1220,18 @@ ; ; AVX1-LABEL: test_arg_v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v8i64: @@ -1275,18 +1275,18 @@ ; ; AVX1-LABEL: test_arg_v32i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 -; 
AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 +; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v32i16: @@ -1346,18 +1346,18 @@ ; ; AVX1-LABEL: test_arg_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v64i8: Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -1055,26 +1055,25 @@ ; ; AVX1-LABEL: interleave_24i16_out: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu 32(%rdi), %xmm0 -; AVX1-NEXT: vmovdqu (%rdi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-NEXT: vmovdqu (%rdi), 
%xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX1-NEXT: vmovdqu %xmm3, (%rsi) ; AVX1-NEXT: vmovdqu %xmm4, (%rdx) ; AVX1-NEXT: vmovdqu %xmm0, (%rcx) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleave_24i16_out: @@ -1101,19 +1100,18 @@ ; ; XOP-LABEL: interleave_24i16_out: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqu 32(%rdi), %xmm0 -; XOP-NEXT: vmovdqu (%rdi), %ymm1 -; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm0[4,5,10,11] -; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm0[0,1,6,7,12,13] -; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[4,5,10,11],xmm2[0,1,6,7,12,13,14,15,0,1,2,3] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7,8,9],xmm0[2,3,8,9,14,15] +; XOP-NEXT: vmovdqu (%rdi), %xmm0 +; XOP-NEXT: vmovdqu 16(%rdi), %xmm1 +; XOP-NEXT: vmovdqu 32(%rdi), %xmm2 +; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm2[4,5,10,11] +; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm2[0,1,6,7,12,13] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11],xmm1[0,1,6,7,12,13,14,15,0,1,2,3] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9],xmm2[2,3,8,9,14,15] ; XOP-NEXT: vmovdqu %xmm3, (%rsi) ; XOP-NEXT: vmovdqu %xmm4, (%rdx) ; XOP-NEXT: vmovdqu %xmm0, (%rcx) -; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %wide.vec = load <24 x i16>, <24 x i16>* %p, align 4 %s1 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> @@ -1353,9 +1351,9 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovups (%rdi), %ymm0 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; AVX1-NEXT: vmovups 64(%rdi), %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1] +; AVX1-NEXT: vmovups 80(%rdi), %xmm2 +; AVX1-NEXT: vmovups 64(%rdi), %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm2[1] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 @@ -1364,7 +1362,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; 
AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3] +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm2[2],xmm3[3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -1374,7 +1372,7 @@ ; AVX1-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -1451,9 +1449,9 @@ ; XOP: # %bb.0: ; XOP-NEXT: vmovups (%rdi), %ymm0 ; XOP-NEXT: vmovups 32(%rdi), %ymm1 -; XOP-NEXT: vmovups 64(%rdi), %ymm2 -; XOP-NEXT: vextractf128 $1, %ymm2, %xmm3 -; XOP-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1] +; XOP-NEXT: vmovups 80(%rdi), %xmm2 +; XOP-NEXT: vmovups 64(%rdi), %xmm3 +; XOP-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm2[1] ; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; XOP-NEXT: vextractf128 $1, %ymm5, %xmm6 @@ -1462,7 +1460,7 @@ ; XOP-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3] ; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; XOP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; XOP-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3] +; XOP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm2[2],xmm3[3] ; XOP-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2] ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; XOP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -1472,7 +1470,7 @@ ; XOP-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3] ; XOP-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3] +; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,3] ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -1583,35 +1581,36 @@ ; ; AVX1-LABEL: interleave_24i32_in: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovups (%rsi), %ymm0 -; AVX1-NEXT: vmovups (%rdx), %ymm1 -; AVX1-NEXT: vmovupd (%rcx), %ymm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm2[0,0] +; AVX1-NEXT: vmovupd (%rsi), %ymm0 +; AVX1-NEXT: vmovupd (%rcx), %ymm1 +; AVX1-NEXT: vmovups 16(%rcx), %xmm2 +; AVX1-NEXT: vmovups (%rdx), %xmm3 +; AVX1-NEXT: vmovups 16(%rdx), %xmm4 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX1-NEXT: vmovups (%rsi), %xmm4 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) -; AVX1-NEXT: vmovups %ymm4, 64(%rdi) ; AVX1-NEXT: vmovups %ymm3, (%rdi) +; AVX1-NEXT: vmovups %ymm2, 64(%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1620,17 +1619,17 @@ ; AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0 ; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm4 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = mem[1,0,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3] +; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] @@ -1638,8 +1637,8 @@ ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-SLOW-NEXT: vmovups %ymm4, 64(%rdi) -; AVX2-SLOW-NEXT: vmovups %ymm3, (%rdi) +; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdi) +; AVX2-SLOW-NEXT: vmovups %ymm3, 64(%rdi) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1648,58 +1647,59 @@ ; AVX2-FAST-NEXT: vmovups (%rsi), %ymm0 ; AVX2-FAST-NEXT: vmovups (%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups (%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-FAST-NEXT: vbroadcastsd %xmm2, %ymm4 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, 64(%rdi) -; AVX2-FAST-NEXT: vmovups %ymm4, 32(%rdi) -; AVX2-FAST-NEXT: vmovups %ymm3, (%rdi) +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-FAST-NEXT: vmovups %ymm4, (%rdi) +; AVX2-FAST-NEXT: vmovups %ymm3, 64(%rdi) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; XOP-LABEL: interleave_24i32_in: ; XOP: # %bb.0: -; XOP-NEXT: vmovups (%rsi), %ymm0 -; XOP-NEXT: vmovups (%rdx), %ymm1 -; XOP-NEXT: vmovupd (%rcx), %ymm2 -; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0] -; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0] -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1] -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm2[0,0] +; XOP-NEXT: vmovupd (%rsi), %ymm0 +; XOP-NEXT: vmovupd (%rcx), %ymm1 +; XOP-NEXT: vmovups 16(%rcx), %xmm2 +; XOP-NEXT: vmovups (%rdx), %xmm3 +; XOP-NEXT: vmovups 16(%rdx), %xmm4 +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0] +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2] +; 
XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0] +; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2] +; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3] +; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; XOP-NEXT: vmovups (%rsi), %xmm4 +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0] +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] +; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] ; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; XOP-NEXT: vextractf128 $1, %ymm2, %xmm4 -; XOP-NEXT: vextractf128 $1, %ymm1, %xmm5 -; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0] -; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0] -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2] -; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3] -; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; XOP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm2[2],ymm0[3],ymm2[2,3],ymm0[4],ymm2[5,4],ymm0[5] -; XOP-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5] +; XOP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; XOP-NEXT: vmovups %ymm0, 32(%rdi) -; XOP-NEXT: vmovups %ymm4, 64(%rdi) ; XOP-NEXT: vmovups %ymm3, (%rdi) +; XOP-NEXT: vmovups %ymm2, 64(%rdi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %s1 = load <8 x i32>, <8 x i32>* %q1, align 4 Index: test/CodeGen/X86/pmaddubsw.ll =================================================================== --- test/CodeGen/X86/pmaddubsw.ll +++ test/CodeGen/X86/pmaddubsw.ll @@ -51,13 +51,11 @@ ; ; AVX1-LABEL: pmaddubsw_256: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa (%rsi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vpmaddubsw 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX256-LABEL: pmaddubsw_256: @@ -90,61 +88,53 @@ ; SSE-LABEL: pmaddubsw_512: ; SSE: # %bb.0: ; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: movdqa 112(%rdx), %xmm0 -; SSE-NEXT: movdqa 96(%rdx), %xmm1 -; SSE-NEXT: movdqa 80(%rdx), %xmm2 -; SSE-NEXT: movdqa 64(%rdx), %xmm3 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa 16(%rdx), %xmm5 -; SSE-NEXT: movdqa 32(%rdx), %xmm6 -; SSE-NEXT: movdqa 48(%rdx), %xmm7 -; SSE-NEXT: pmaddubsw (%rsi), %xmm4 -; SSE-NEXT: pmaddubsw 16(%rsi), %xmm5 -; SSE-NEXT: pmaddubsw 32(%rsi), %xmm6 -; SSE-NEXT: pmaddubsw 48(%rsi), %xmm7 -; SSE-NEXT: pmaddubsw 64(%rsi), %xmm3 -; SSE-NEXT: pmaddubsw 80(%rsi), %xmm2 -; SSE-NEXT: pmaddubsw 96(%rsi), %xmm1 -; SSE-NEXT: pmaddubsw 112(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 112(%rdi) -; 
SSE-NEXT: movdqa %xmm1, 96(%rdi) -; SSE-NEXT: movdqa %xmm2, 80(%rdi) -; SSE-NEXT: movdqa %xmm3, 64(%rdi) -; SSE-NEXT: movdqa %xmm7, 48(%rdi) -; SSE-NEXT: movdqa %xmm6, 32(%rdi) -; SSE-NEXT: movdqa %xmm5, 16(%rdi) -; SSE-NEXT: movdqa %xmm4, (%rdi) +; SSE-NEXT: movdqa (%rdx), %xmm0 +; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa 32(%rdx), %xmm2 +; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: pmaddubsw (%rsi), %xmm0 +; SSE-NEXT: pmaddubsw 16(%rsi), %xmm1 +; SSE-NEXT: pmaddubsw 32(%rsi), %xmm2 +; SSE-NEXT: pmaddubsw 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 64(%rdx), %xmm4 +; SSE-NEXT: pmaddubsw 64(%rsi), %xmm4 +; SSE-NEXT: movdqa 80(%rdx), %xmm5 +; SSE-NEXT: pmaddubsw 80(%rsi), %xmm5 +; SSE-NEXT: movdqa 96(%rdx), %xmm6 +; SSE-NEXT: pmaddubsw 96(%rsi), %xmm6 +; SSE-NEXT: movdqa 112(%rdx), %xmm7 +; SSE-NEXT: pmaddubsw 112(%rsi), %xmm7 +; SSE-NEXT: movdqa %xmm7, 112(%rdi) +; SSE-NEXT: movdqa %xmm6, 96(%rdi) +; SSE-NEXT: movdqa %xmm5, 80(%rdi) +; SSE-NEXT: movdqa %xmm4, 64(%rdi) +; SSE-NEXT: movdqa %xmm3, 48(%rdi) +; SSE-NEXT: movdqa %xmm2, 32(%rdi) +; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: pmaddubsw_512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX1-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX1-NEXT: vmovdqa (%rsi), %ymm4 -; AVX1-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX1-NEXT: vmovdqa 64(%rsi), %ymm6 -; AVX1-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7 -; AVX1-NEXT: vpmaddubsw %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpmaddubsw %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmaddubsw %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmaddubsw %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm4 -; AVX1-NEXT: vpmaddubsw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpmaddubsw %xmm8, %xmm9, %xmm4 +; AVX1-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX1-NEXT: vpmaddubsw 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpmaddubsw 48(%rdi), %xmm3, %xmm1 +; AVX1-NEXT: vpmaddubsw 32(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovdqa 80(%rsi), %xmm2 +; AVX1-NEXT: vpmaddubsw 80(%rdi), %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX1-NEXT: vpmaddubsw 64(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vmovdqa 112(%rsi), %xmm3 +; AVX1-NEXT: vpmaddubsw 112(%rdi), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX1-NEXT: vpmaddubsw 96(%rdi), %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: retq ; Index: test/CodeGen/X86/pr22774.ll =================================================================== --- test/CodeGen/X86/pr22774.ll +++ test/CodeGen/X86/pr22774.ll @@ -7,11 +7,9 @@ define i32 @_Z3foov() { ; CHECK-LABEL: _Z3foov: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovdqa 
{{.*}}(%rip), %ymm0 -; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; CHECK-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovaps %xmm0, {{.*}}(%rip) ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = load <4 x i64>, <4 x i64>* @in, align 32 Index: test/CodeGen/X86/pr34653.ll =================================================================== --- test/CodeGen/X86/pr34653.ll +++ test/CodeGen/X86/pr34653.ll @@ -12,68 +12,74 @@ ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-512, %rsp # imm = 0xFE00 -; CHECK-NEXT: subq $1536, %rsp # imm = 0x600 +; CHECK-NEXT: subq $2048, %rsp # imm = 0x800 ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: callq test ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovaps %xmm0, %xmm1 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm2 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3 -; CHECK-NEXT: vmovaps %xmm3, %xmm4 -; CHECK-NEXT: vmovaps %xmm2, %xmm5 -; CHECK-NEXT: vmovaps %xmm5, %xmm6 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm7 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm8 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm9 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm10 -; CHECK-NEXT: vextractf32x4 $3, %zmm10, %xmm11 -; CHECK-NEXT: vmovaps %xmm11, %xmm12 -; CHECK-NEXT: vextractf32x4 $2, %zmm10, %xmm13 -; CHECK-NEXT: vmovaps %xmm13, %xmm14 -; CHECK-NEXT: vmovaps %xmm10, %xmm15 -; CHECK-NEXT: vmovaps %zmm15, %zmm16 -; CHECK-NEXT: vextractf32x4 $3, %zmm9, %xmm2 -; CHECK-NEXT: vmovaps %zmm2, %zmm17 -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vextractf32x4 $2, %zmm9, %xmm0 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm4 +; CHECK-NEXT: vmovaps %xmm4, %xmm5 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm6 +; CHECK-NEXT: vmovaps %xmm6, %xmm7 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm8 +; CHECK-NEXT: vmovaps %xmm8, %xmm9 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm10 +; CHECK-NEXT: vmovaps %xmm10, %xmm11 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm12 +; CHECK-NEXT: vmovaps %xmm12, %xmm13 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm14 +; CHECK-NEXT: vmovaps %xmm14, %xmm15 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vmovaps %zmm0, %zmm16 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vmovaps %zmm0, %zmm17 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovaps %zmm0, %zmm18 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm9, %xmm0 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovaps %zmm0, %zmm19 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf32x4 $3, %zmm8, %xmm0 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovaps %zmm0, %zmm20 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf32x4 $2, %zmm8, %xmm0 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovaps %zmm0, %zmm21 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm8, %xmm0 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), 
%xmm0 ; CHECK-NEXT: vmovaps %zmm0, %zmm22 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf32x4 $3, %zmm7, %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm23 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf32x4 $2, %zmm7, %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm24 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm7, %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm25 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0] -; CHECK-NEXT: # kill: def $ymm10 killed $ymm10 killed $zmm10 -; CHECK-NEXT: vextractf128 $1, %ymm10, %xmm10 -; CHECK-NEXT: vmovaps %zmm10, %zmm26 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vmovaps %zmm0, %zmm23 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm12 = xmm12[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm14 = xmm14[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: # kill: def $ymm9 killed $ymm9 killed $zmm9 -; CHECK-NEXT: vextractf128 $1, %ymm9, %xmm9 -; CHECK-NEXT: vmovaps %zmm9, %zmm27 ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] @@ -83,9 +89,6 @@ ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: # kill: def $ymm8 killed $ymm8 killed $zmm8 -; CHECK-NEXT: vextractf128 $1, %ymm8, %xmm8 -; CHECK-NEXT: vmovaps %zmm8, %zmm28 ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] @@ -95,16 +98,30 @@ ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: # kill: def $ymm7 killed $ymm7 killed $zmm7 -; 
CHECK-NEXT: vextractf128 $1, %ymm7, %xmm7 -; CHECK-NEXT: vmovaps %zmm7, %zmm29 ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 8-byte Reload +; CHECK-NEXT: # xmm24 = mem[0],zero +; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 8-byte Reload +; CHECK-NEXT: # xmm25 = mem[0],zero +; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 8-byte Reload +; CHECK-NEXT: # xmm26 = mem[0],zero +; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 8-byte Reload +; CHECK-NEXT: # xmm27 = mem[0],zero +; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 8-byte Reload +; CHECK-NEXT: # xmm28 = mem[0],zero +; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 8-byte Reload +; CHECK-NEXT: # xmm29 = mem[0],zero ; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 8-byte Reload ; CHECK-NEXT: # xmm30 = mem[0],zero ; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 8-byte Reload @@ -131,11 +148,13 @@ ; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -146,22 +165,20 @@ ; CHECK-NEXT: vmovsd %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
CHECK-NEXT: vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: vmovsd %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovsd %xmm7, (%rsp) # 8-byte Spill ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 Index: test/CodeGen/X86/sad.ll =================================================================== --- test/CodeGen/X86/sad.ll +++ test/CodeGen/X86/sad.ll @@ -293,16 +293,14 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB1_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqa a+1024(%rax), %ymm2 -; AVX1-NEXT: vmovdqa b+1024(%rax), %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm2 +; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm3 +; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: addq $4, %rax ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %middle.block @@ -1344,17 +1342,14 @@ ; ; AVX1-LABEL: sad_nonloop_32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %ymm0 -; AVX1-NEXT: vmovdqu (%rdx), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: sad_nonloop_32i8: Index: 
test/CodeGen/X86/sandybridge-loads.ll =================================================================== --- test/CodeGen/X86/sandybridge-loads.ll +++ test/CodeGen/X86/sandybridge-loads.ll @@ -30,9 +30,10 @@ ; CHECK-LABEL: widestores: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vmovaps (%rsi), %xmm1 +; CHECK-NEXT: vmovaps 16(%rsi), %xmm2 ; CHECK-NEXT: vmovaps %ymm0, (%rsi) -; CHECK-NEXT: vextractf128 $1, %ymm1, 16(%rdi) +; CHECK-NEXT: vmovaps %xmm2, 16(%rdi) ; CHECK-NEXT: vmovaps %xmm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq Index: test/CodeGen/X86/shrink_vmul.ll =================================================================== --- test/CodeGen/X86/shrink_vmul.ll +++ test/CodeGen/X86/shrink_vmul.ll @@ -2216,68 +2216,65 @@ ; X86-AVX1-NEXT: pushl %ebx ; X86-AVX1-NEXT: pushl %edi ; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: subl $8, %esp +; X86-AVX1-NEXT: subl $16, %esp ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: vmovdqa (%eax), %ymm2 -; X86-AVX1-NEXT: vmovdqa (%ecx), %ymm1 -; X86-AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; X86-AVX1-NEXT: vmovd %xmm2, %eax +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X86-AVX1-NEXT: vmovd %xmm1, %eax ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl 32(%ecx) ; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; X86-AVX1-NEXT: vpextrd $3, %xmm3, %eax +; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax +; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm3 +; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm1 +; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; X86-AVX1-NEXT: vpextrd $2, %xmm3, %eax +; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax +; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, %edi -; X86-AVX1-NEXT: vpextrd $1, %xmm1, %ecx -; X86-AVX1-NEXT: vpextrd $1, %xmm3, %eax +; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax +; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, %ebx -; X86-AVX1-NEXT: vmovd %xmm1, %ecx -; X86-AVX1-NEXT: vmovd %xmm3, %eax +; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-AVX1-NEXT: vmovd %xmm2, %eax +; X86-AVX1-NEXT: vmovd %xmm3, %ecx ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: divl %ecx ; X86-AVX1-NEXT: movl %edx, %ebp ; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx ; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, %ecx +; X86-AVX1-NEXT: 
movl %edx, %ebx ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi ; X86-AVX1-NEXT: divl %esi ; X86-AVX1-NEXT: movl %edx, %esi -; X86-AVX1-NEXT: vmovd %ebp, %xmm2 ; X86-AVX1-NEXT: xorl %edx, %edx ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax -; X86-AVX1-NEXT: vpextrd $1, %xmm1, %ebp -; X86-AVX1-NEXT: divl %ebp -; X86-AVX1-NEXT: movl %edx, %ebp -; X86-AVX1-NEXT: vpinsrd $1, %ebx, %xmm2, %xmm2 -; X86-AVX1-NEXT: vmovd %xmm0, %eax -; X86-AVX1-NEXT: vpinsrd $2, %edi, %xmm2, %xmm0 -; X86-AVX1-NEXT: vmovd %xmm1, %edi -; X86-AVX1-NEXT: vpinsrd $3, (%esp), %xmm0, %xmm0 # 4-byte Folded Reload -; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi ; X86-AVX1-NEXT: divl %edi -; X86-AVX1-NEXT: vmovd %edx, %xmm1 -; X86-AVX1-NEXT: vpinsrd $1, %ebp, %xmm1, %xmm1 -; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm1, %xmm1 -; X86-AVX1-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 +; X86-AVX1-NEXT: movl %edx, %edi +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: vmovd %xmm1, %ecx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: vmovd %edx, %xmm0 +; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %ebp, %xmm1 +; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload ; X86-AVX1-NEXT: vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Folded Reload ; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero ; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 @@ -2285,11 +2282,11 @@ ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199] ; X86-AVX1-NEXT: vpmulld %xmm4, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1 ; X86-AVX1-NEXT: vmovd %xmm1, (%eax) ; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) -; X86-AVX1-NEXT: addl $8, %esp +; X86-AVX1-NEXT: addl $16, %esp ; X86-AVX1-NEXT: popl %esi ; X86-AVX1-NEXT: popl %edi ; X86-AVX1-NEXT: popl %ebx @@ -2303,13 +2300,12 @@ ; X86-AVX2-NEXT: pushl %esi ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: vmovdqa (%eax), %ymm2 -; X86-AVX2-NEXT: vmovdqa (%esi), %ymm1 -; X86-AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2 +; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3 ; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx -; X86-AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 ; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax ; X86-AVX2-NEXT: xorl %edx, %edx ; X86-AVX2-NEXT: divl %ecx @@ -2330,29 +2326,28 @@ ; X86-AVX2-NEXT: xorl %edx, %edx ; X86-AVX2-NEXT: divl %ecx ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 -; X86-AVX2-NEXT: vpextrd $1, %xmm1, %ecx -; 
X86-AVX2-NEXT: vpextrd $1, %xmm2, %eax +; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx +; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax ; X86-AVX2-NEXT: xorl %edx, %edx ; X86-AVX2-NEXT: divl %ecx ; X86-AVX2-NEXT: movl %edx, %ecx -; X86-AVX2-NEXT: vmovd %xmm1, %edi -; X86-AVX2-NEXT: vmovd %xmm2, %eax +; X86-AVX2-NEXT: vmovd %xmm2, %edi +; X86-AVX2-NEXT: vmovd %xmm1, %eax ; X86-AVX2-NEXT: xorl %edx, %edx ; X86-AVX2-NEXT: divl %edi ; X86-AVX2-NEXT: vmovd %edx, %xmm4 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 -; X86-AVX2-NEXT: vpextrd $2, %xmm1, %ecx -; X86-AVX2-NEXT: vpextrd $2, %xmm2, %eax +; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx +; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax ; X86-AVX2-NEXT: xorl %edx, %edx ; X86-AVX2-NEXT: divl %ecx ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 -; X86-AVX2-NEXT: vpextrd $3, %xmm1, %ecx -; X86-AVX2-NEXT: vpextrd $3, %xmm2, %eax +; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx +; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax ; X86-AVX2-NEXT: xorl %edx, %edx ; X86-AVX2-NEXT: divl %ecx ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 ; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: xorl %edx, %edx ; X86-AVX2-NEXT: divl 32(%esi) @@ -2467,39 +2462,36 @@ ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: pushq %rbp ; X64-AVX1-NEXT: pushq %rbx -; X64-AVX1-NEXT: vmovdqa (%rdi), %ymm2 -; X64-AVX1-NEXT: vmovdqa (%rsi), %ymm1 -; X64-AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; X64-AVX1-NEXT: vmovd %xmm2, %eax +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; X64-AVX1-NEXT: vmovd %xmm1, %eax ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl 32(%rsi) ; X64-AVX1-NEXT: movl %edx, %r8d -; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; X64-AVX1-NEXT: vpextrd $3, %xmm3, %eax +; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax +; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm3 +; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 +; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ecx ; X64-AVX1-NEXT: movl %edx, %r9d -; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; X64-AVX1-NEXT: vpextrd $2, %xmm3, %eax +; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax +; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ecx ; X64-AVX1-NEXT: movl %edx, %r10d -; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ecx -; X64-AVX1-NEXT: vpextrd $1, %xmm3, %eax +; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax +; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ecx ; X64-AVX1-NEXT: movl %edx, %r11d -; X64-AVX1-NEXT: vmovd %xmm1, %ecx -; X64-AVX1-NEXT: vmovd %xmm3, %eax +; X64-AVX1-NEXT: vmovd %xmm2, %eax +; X64-AVX1-NEXT: vmovd %xmm3, %ecx ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ecx ; X64-AVX1-NEXT: movl %edx, %esi ; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax -; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X64-AVX1-NEXT: vpextrd $3, %xmm1, 
%ecx ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ecx @@ -2518,18 +2510,18 @@ ; X64-AVX1-NEXT: vmovd %xmm1, %ebp ; X64-AVX1-NEXT: xorl %edx, %edx ; X64-AVX1-NEXT: divl %ebp -; X64-AVX1-NEXT: vmovd %esi, %xmm0 -; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %edx, %xmm0 +; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovd %edx, %xmm2 -; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm2, %xmm2 -; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 -; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2 +; X64-AVX1-NEXT: vmovd %esi, %xmm2 +; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vmovd %r8d, %xmm1 ; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 ; X64-AVX1-NEXT: vmovd %eax, %xmm2 @@ -2543,13 +2535,12 @@ ; ; X64-AVX2-LABEL: PR34947: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa (%rdi), %ymm2 -; X64-AVX2-NEXT: vmovdqa (%rsi), %ymm1 -; X64-AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 -; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2 +; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3 ; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx -; X64-AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 ; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax ; X64-AVX2-NEXT: xorl %edx, %edx ; X64-AVX2-NEXT: divl %ecx @@ -2570,29 +2561,28 @@ ; X64-AVX2-NEXT: xorl %edx, %edx ; X64-AVX2-NEXT: divl %ecx ; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 -; X64-AVX2-NEXT: vpextrd $1, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrd $1, %xmm2, %eax +; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx +; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax ; X64-AVX2-NEXT: xorl %edx, %edx ; X64-AVX2-NEXT: divl %ecx ; X64-AVX2-NEXT: movl %edx, %ecx -; X64-AVX2-NEXT: vmovd %xmm1, %edi -; X64-AVX2-NEXT: vmovd %xmm2, %eax +; X64-AVX2-NEXT: vmovd %xmm2, %edi +; X64-AVX2-NEXT: vmovd %xmm1, %eax ; X64-AVX2-NEXT: xorl %edx, %edx ; X64-AVX2-NEXT: divl %edi ; X64-AVX2-NEXT: vmovd %edx, %xmm4 ; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 -; X64-AVX2-NEXT: vpextrd $2, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrd $2, %xmm2, %eax +; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx +; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax ; X64-AVX2-NEXT: xorl %edx, %edx ; X64-AVX2-NEXT: divl %ecx ; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 -; X64-AVX2-NEXT: vpextrd $3, %xmm1, %ecx -; X64-AVX2-NEXT: vpextrd $3, %xmm2, %eax +; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx +; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax ; X64-AVX2-NEXT: xorl %edx, %edx ; X64-AVX2-NEXT: divl %ecx ; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 ; X64-AVX2-NEXT: 
vinserti128 $1, %xmm3, %ymm1, %ymm1 -; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: xorl %edx, %edx ; X64-AVX2-NEXT: divl 32(%rsi) Index: test/CodeGen/X86/shuffle-strided-with-offset-256.ll =================================================================== --- test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -8,40 +8,26 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v16i8_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v16i8_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vmovdqa %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v16i8_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512-LABEL: shuffle_v32i8_to_v16i8_1: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> @@ -50,40 +36,26 @@ } define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind { -; AVX1-LABEL: shuffle_v16i16_to_v8i16_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v16i16_to_v8i16_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; 
AVX2-NEXT: vmovdqa %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v16i16_to_v8i16_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512-LABEL: shuffle_v16i16_to_v8i16_1: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -94,20 +66,16 @@ define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind { ; AVX-LABEL: shuffle_v8i32_to_v4i32_1: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] ; AVX-NEXT: vmovaps %xmm0, (%rsi) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: shuffle_v8i32_to_v4i32_1: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rdi), %ymm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] ; AVX512-NEXT: vmovaps %xmm0, (%rsi) -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %L %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -116,76 +84,59 @@ } define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512F: # %bb.0: -; 
AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> @@ -194,76 +145,59 @@ } define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> @@ -272,76 +206,59 @@ } define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; 
AVX-LABEL: shuffle_v32i8_to_v8i8_3: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> @@ -352,89 +269,76 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) -; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512BWVL: # %bb.0: -; 
AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -445,83 +349,68 @@ define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) -; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX512F-NEXT: 
vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %ymm0 -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -532,89 +421,76 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) -; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; 
AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -623,76 +499,59 @@ } define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, 
(%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> @@ -701,76 +560,59 @@ } define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind { -; AVX1-LABEL: 
shuffle_v32i8_to_v4i8_2: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -779,76 +621,59 @@
}
define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -857,70 +682,51 @@
}
define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; 
AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> @@ -929,76 +735,59 @@ } define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_5: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_5: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_5: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa 
(%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> @@ -1007,76 +796,59 @@ } define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_6: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_6: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_6: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, 
%xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> @@ -1085,76 +857,59 @@ } define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8_7: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8_7: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8_7: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: 
vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> Index: test/CodeGen/X86/shuffle-strided-with-offset-512.ll =================================================================== --- test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -31,8 +31,8 @@ ; ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -43,8 +43,8 @@ ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] @@ -85,8 +85,8 @@ ; ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 ; 
AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
@@ -97,11 +97,10 @@
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
store <16 x i16> %strided.vec, <16 x i16>* %S
@@ -113,9 +112,8 @@
define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps (%rdi), %zmm0
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512F-NEXT: vmovaps (%rdi), %ymm0
+; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
@@ -123,19 +121,17 @@
;
; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
-; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
+; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps (%rdi), %zmm0
-; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
@@ -143,11 +139,10 @@
;
; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
-; AVX512BWVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
%strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
store <8 x i32> %strided.vec, <8 x i32>* %S
@@ -157,81 +152,23 @@
}
define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8_1:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; 
AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v16i8_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; 
AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> store <16 x i8> %strided.vec, <16 x i8>* %S @@ -239,81 +176,23 @@ } define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v16i8_2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_2: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_2: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v16i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> store <16 x i8> %strided.vec, <16 x i8>* %S @@ -321,81 +200,23 @@ } define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v16i8_3: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_3: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_3: -; AVX512BW: # %bb.0: -; 
AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_3: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v16i8_3: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> store <16 x i8> %strided.vec, <16 x i8>* %S @@ -405,70 +226,62 @@ define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX512F-LABEL: shuffle_v32i16_to_v8i16_1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512F-NEXT: vpunpckldq 
{{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L @@ -480,70 +293,62 @@ define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX512F-LABEL: shuffle_v32i16_to_v8i16_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[3,1,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = 
[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L @@ -555,70 +360,62 @@ define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX512F-LABEL: shuffle_v32i16_to_v8i16_3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,1,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[3,1,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_3: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq 
{{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L @@ -630,78 +427,74 @@ define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = 
<1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11] +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> @@ -712,68 +505,64 @@ define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2: ; AVX512BW: # %bb.0: -;
AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L @@ -785,78 +574,74 @@ define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 =
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -;
AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11] +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> @@ -867,68 +652,64 @@ define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_4: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512VL-NEXT:
vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L @@ -940,78 +721,74 @@ define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -;
AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +;
AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> @@ -1022,68 +799,64 @@ define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 =
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 =
xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L @@ -1095,78 +868,74 @@ define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT:
vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BWVL-NEXT:
vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> Index: test/CodeGen/X86/shuffle-vs-trunc-256.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -13,40 +13,26 @@ ; Ideally, the shuffles should be lowered to code with the same quality as the truncates. define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vmovdqa %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512-LABEL: shuffle_v32i8_to_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> @@ -55,29 +41,16 @@ } define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX1-LABEL: trunc_v16i16_to_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v16i16_to_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vmovdqa %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: trunc_v16i16_to_v16i8: +; 
AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_v16i16_to_v16i8: ; AVX512F: # %bb.0: @@ -122,40 +95,26 @@ } define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { -; AVX1-LABEL: shuffle_v16i16_to_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v16i16_to_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vmovdqa %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v16i16_to_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512-LABEL: shuffle_v16i16_to_v8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -166,14 +125,13 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX1-LABEL: trunc_v8i32_to_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i32_to_v8i16: @@ -231,20 +189,16 @@ define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { ; AVX-LABEL: shuffle_v8i32_to_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX-NEXT: vmovaps %xmm0, (%rsi) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512-LABEL: 
shuffle_v8i32_to_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rdi), %ymm0 -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512-NEXT: vmovaps %xmm0, (%rsi) -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %L %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -255,11 +209,9 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { ; AVX1-LABEL: trunc_v4i64_to_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX1-NEXT: vmovaps %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32: @@ -322,88 +274,70 @@ } define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: ; AVX512BW: # %bb.0: -; 
AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VBMIVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VBMIVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VBMIVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> @@ -414,14 +348,13 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX1-LABEL: trunc_v8i32_to_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i32_to_v8i8: @@ -1041,92 +974,75 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-LABEL: shuffle_v16i16_to_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) -; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %ymm0 -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovaps (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VBMIVL-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512VBMIVL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -1137,12 +1053,10 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-LABEL: trunc_v4i64_to_v4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16: @@ -1209,79 +1123,58 @@ } define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vmovd %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %ymm0 -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: 
vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovaps (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512VBMIVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> @@ -1292,12 +1185,10 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX1-LABEL: trunc_v4i64_to_v4i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8: Index: test/CodeGen/X86/shuffle-vs-trunc-512.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -37,8 +37,8 @@ ; ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -49,8 +49,8 @@ ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] @@ -61,8 +61,8 @@ ; ; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMI-NEXT: vextracti64x4 $1, 
%zmm0, %ymm1 +; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -73,11 +73,10 @@ ; ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] -; AVX512VBMIVL-NEXT: vpermi2b %ymm1, %ymm0, %ymm2 -; AVX512VBMIVL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L @@ -171,8 +170,8 @@ ; ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] ; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 @@ -184,18 +183,17 @@ ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] ; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0 @@ -207,11 +205,10 @@ ; ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; AVX512VBMIVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512VBMIVL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L @@ 
-237,9 +234,8 @@ define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { ; AVX512F-LABEL: shuffle_v16i32_to_v8i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %zmm0 -; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] ; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: vmovaps %ymm0, (%rsi) ; AVX512F-NEXT: vzeroupper @@ -247,19 +243,17 @@ ; ; AVX512VL-LABEL: shuffle_v16i32_to_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] -; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] +; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 +; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i32_to_v8i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps (%rdi), %zmm0 -; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -267,19 +261,17 @@ ; ; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] -; AVX512BWVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovaps (%rdi), %zmm0 -; AVX512VBMI-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0 +; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] ; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi) ; AVX512VBMI-NEXT: vzeroupper @@ -287,11 +279,10 @@ ; ; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] -; AVX512VBMIVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 -; AVX512VBMIVL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] +; AVX512VBMIVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %L @@ -317,106 +308,100 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: 
vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VBMI-NEXT: vzeroupper ; 
AVX512VBMI-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2b %ymm1, %ymm0, %ymm2 -; AVX512VBMIVL-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L @@ -442,99 +427,89 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX512F-LABEL: shuffle_v32i16_to_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: 
vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512VBMIVL-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> +; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L @@ -560,97 +535,91 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi) -; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512VBMIVL-NEXT: vpmovwb %xmm2, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u> +; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi) ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L @@ -863,12 +832,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) { ; AVX512F-LABEL: PR34175: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -877,12 +844,10 @@ ; ; AVX512VL-LABEL: PR34175: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX512VL-NEXT: 
vpbroadcastd %xmm1, %xmm1 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -891,49 +856,43 @@ ; ; AVX512BW-LABEL: PR34175: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: PR34175: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512BWVL-NEXT: retq ; ; AVX512VBMI-LABEL: PR34175: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VBMI-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512VBMI-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VBMI-NEXT: retq ; ; AVX512VBMIVL-LABEL: PR34175: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} 
xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VBMIVL-NEXT: retq %v = load <32 x i16>, <32 x i16>* %p, align 2 Index: test/CodeGen/X86/sse2.ll =================================================================== --- test/CodeGen/X86/sse2.ll +++ test/CodeGen/X86/sse2.ll @@ -574,10 +574,8 @@ ; X86-AVX-LABEL: test16: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovaps 96(%eax), %ymm0 -; X86-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: vmovaps 96(%eax), %xmm0 +; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test16: @@ -588,10 +586,8 @@ ; ; X64-AVX-LABEL: test16: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps 96(%rdi), %ymm0 -; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: vmovaps 96(%rdi), %xmm0 +; X64-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X64-AVX-NEXT: retq %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3 %i6 = load <4 x double>, <4 x double>* %i5, align 32 Index: test/CodeGen/X86/var-permute-256.ll =================================================================== --- test/CodeGen/X86/var-permute-256.ll +++ test/CodeGen/X86/var-permute-256.ll @@ -25,20 +25,18 @@ ; ; AVX1-LABEL: var_shuffle_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4 +; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4i64: @@ -90,17 +88,15 @@ ; ; AVX1-LABEL: var_shuffle_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8i32: @@ -455,20 +451,18 @@ ; ; AVX1-LABEL: var_shuffle_v4f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4 +; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4f64: @@ -520,17 +514,15 @@ ; ; AVX1-LABEL: var_shuffle_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8f32: @@ -584,20 +576,18 @@ ; AVX1-LABEL: var_shuffle_v4i64_from_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4 +; 
AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4i64_from_v2i64: @@ -653,17 +643,15 @@ ; AVX1-LABEL: var_shuffle_v8i32_from_v4i32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8i32_from_v4i32: @@ -1015,20 +1003,18 @@ ; AVX1-LABEL: var_shuffle_v4f64_from_v2f64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4 +; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4f64_from_v2f64: @@ -1084,17 +1070,15 @@ ; AVX1-LABEL: var_shuffle_v8f32_from_v4f32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilps %ymm1, 
%ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8f32_from_v4f32: @@ -1148,11 +1132,9 @@ ; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,3,3,3,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -3254,44 +3254,27 @@ ; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE41-NEXT: retq ; -; AVX1-LABEL: sitofp_load_4i64_to_4f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_load_4i64_to_4f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; VEX-LABEL: sitofp_load_4i64_to_4f64: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; VEX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_4i64_to_4f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 
16(%rdi), %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax ; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax @@ -3307,8 +3290,8 @@ ; ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -3680,17 +3663,17 @@ ; ; AVX1-LABEL: uitofp_load_4i64_to_4f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7] +; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_load_4i64_to_4f64: @@ -3952,76 +3935,55 @@ ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; SSE41-NEXT: retq ; -; AVX1-LABEL: sitofp_load_4i64_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_load_4i64_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; VEX-LABEL: sitofp_load_4i64_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_4i64_to_4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_4i64_to_4f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32: @@ -4186,128 +4148,97 @@ ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; SSE41-NEXT: retq ; -; AVX1-LABEL: sitofp_load_8i64_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm2[0,1,2],xmm1[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_load_8i64_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; VEX-LABEL: sitofp_load_8i64_to_8f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vmovdqa 32(%rdi), %xmm2 +; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 +; VEX-NEXT: vpextrq $1, %xmm2, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; VEX-NEXT: vmovq %xmm2, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; VEX-NEXT: vmovq %xmm3, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; VEX-NEXT: vpextrq $1, %xmm3, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; VEX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_8i64_to_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 -; AVX512F-NEXT: 
vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512F-NEXT: vpextrq $1, %xmm2, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512F-NEXT: vmovq %xmm3, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_8i64_to_8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512VL-NEXT: vmovq %xmm2, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512VL-NEXT: vmovq %xmm3, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512VL-NEXT: vpextrq $1, 
%xmm3, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32: @@ -4581,174 +4512,103 @@ ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; SSE41-NEXT: retq ; -; AVX1-LABEL: uitofp_load_4i64_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB76_1 -; AVX1-NEXT: # %bb.2: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX1-NEXT: jmp .LBB76_3 -; AVX1-NEXT: .LBB76_1: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: .LBB76_3: -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB76_4 -; AVX1-NEXT: # %bb.5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX1-NEXT: jmp .LBB76_6 -; AVX1-NEXT: .LBB76_4: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB76_6: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB76_7 -; AVX1-NEXT: # %bb.8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX1-NEXT: jmp .LBB76_9 -; AVX1-NEXT: .LBB76_7: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB76_9: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB76_10 -; AVX1-NEXT: # %bb.11: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB76_10: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps 
{{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_4i64_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB76_1 -; AVX2-NEXT: # %bb.2: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX2-NEXT: jmp .LBB76_3 -; AVX2-NEXT: .LBB76_1: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: .LBB76_3: -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB76_4 -; AVX2-NEXT: # %bb.5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX2-NEXT: jmp .LBB76_6 -; AVX2-NEXT: .LBB76_4: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB76_6: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB76_7 -; AVX2-NEXT: # %bb.8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX2-NEXT: jmp .LBB76_9 -; AVX2-NEXT: .LBB76_7: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB76_9: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB76_10 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; AVX2-NEXT: .LBB76_10: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; VEX-LABEL: uitofp_load_4i64_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm2 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 +; VEX-NEXT: vpextrq $1, %xmm2, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: jmp .LBB76_3 +; VEX-NEXT: .LBB76_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB76_3: +; VEX-NEXT: vmovq %xmm2, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: jmp .LBB76_6 +; VEX-NEXT: .LBB76_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB76_6: +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_7 +; VEX-NEXT: # %bb.8: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: jmp .LBB76_9 +; VEX-NEXT: .LBB76_7: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: 
orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB76_9: +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_10 +; VEX-NEXT: # %bb.11: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VEX-NEXT: retq +; VEX-NEXT: .LBB76_10: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VEX-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_4i64_to_4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_4i64_to_4f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32: @@ -5171,320 +5031,193 @@ ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; SSE41-NEXT: retq ; -; AVX1-LABEL: uitofp_load_8i64_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_1 -; AVX1-NEXT: # %bb.2: -; AVX1-NEXT: vcvtsi2ssq 
%rax, %xmm1, %xmm1 -; AVX1-NEXT: jmp .LBB80_3 -; AVX1-NEXT: .LBB80_1: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: .LBB80_3: -; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_4 -; AVX1-NEXT: # %bb.5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 -; AVX1-NEXT: jmp .LBB80_6 -; AVX1-NEXT: .LBB80_4: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm4 -; AVX1-NEXT: .LBB80_6: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_7 -; AVX1-NEXT: # %bb.8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; AVX1-NEXT: jmp .LBB80_9 -; AVX1-NEXT: .LBB80_7: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: .LBB80_9: -; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_10 -; AVX1-NEXT: # %bb.11: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 -; AVX1-NEXT: jmp .LBB80_12 -; AVX1-NEXT: .LBB80_10: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB80_12: -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_13 -; AVX1-NEXT: # %bb.14: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 -; AVX1-NEXT: jmp .LBB80_15 -; AVX1-NEXT: .LBB80_13: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 -; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: .LBB80_15: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_16 -; AVX1-NEXT: # %bb.17: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 -; AVX1-NEXT: jmp .LBB80_18 -; AVX1-NEXT: .LBB80_16: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 -; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: .LBB80_18: -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vmovq %xmm3, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_19 -; AVX1-NEXT: # %bb.20: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX1-NEXT: jmp .LBB80_21 -; AVX1-NEXT: .LBB80_19: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: .LBB80_21: -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm3, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_22 -; AVX1-NEXT: # %bb.23: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 -; AVX1-NEXT: jmp .LBB80_24 -; AVX1-NEXT: .LBB80_22: -; AVX1-NEXT: movq %rax, %rcx -; 
AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 -; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: .LBB80_24: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_8i64_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_1 -; AVX2-NEXT: # %bb.2: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX2-NEXT: jmp .LBB80_3 -; AVX2-NEXT: .LBB80_1: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: .LBB80_3: -; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_4 -; AVX2-NEXT: # %bb.5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 -; AVX2-NEXT: jmp .LBB80_6 -; AVX2-NEXT: .LBB80_4: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm4 -; AVX2-NEXT: .LBB80_6: -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_7 -; AVX2-NEXT: # %bb.8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; AVX2-NEXT: jmp .LBB80_9 -; AVX2-NEXT: .LBB80_7: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: .LBB80_9: -; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_10 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 -; AVX2-NEXT: jmp .LBB80_12 -; AVX2-NEXT: .LBB80_10: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB80_12: -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_13 -; AVX2-NEXT: # %bb.14: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 -; AVX2-NEXT: jmp .LBB80_15 -; AVX2-NEXT: .LBB80_13: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 -; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: .LBB80_15: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_16 -; AVX2-NEXT: # %bb.17: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 -; AVX2-NEXT: jmp .LBB80_18 -; AVX2-NEXT: .LBB80_16: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 -; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: .LBB80_18: -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_19 -; AVX2-NEXT: # %bb.20: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX2-NEXT: jmp .LBB80_21 -; AVX2-NEXT: .LBB80_19: -; 
AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: .LBB80_21: -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_22 -; AVX2-NEXT: # %bb.23: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 -; AVX2-NEXT: jmp .LBB80_24 -; AVX2-NEXT: .LBB80_22: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 -; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: .LBB80_24: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; VEX-LABEL: uitofp_load_8i64_to_8f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm1 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 +; VEX-NEXT: vmovdqa 32(%rdi), %xmm4 +; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 +; VEX-NEXT: vpextrq $1, %xmm4, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_1 +; VEX-NEXT: # %bb.2: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VEX-NEXT: jmp .LBB80_3 +; VEX-NEXT: .LBB80_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB80_3: +; VEX-NEXT: vmovq %xmm4, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_4 +; VEX-NEXT: # %bb.5: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 +; VEX-NEXT: jmp .LBB80_6 +; VEX-NEXT: .LBB80_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm5 +; VEX-NEXT: .LBB80_6: +; VEX-NEXT: vmovq %xmm3, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_7 +; VEX-NEXT: # %bb.8: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; VEX-NEXT: jmp .LBB80_9 +; VEX-NEXT: .LBB80_7: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; VEX-NEXT: .LBB80_9: +; VEX-NEXT: vpextrq $1, %xmm3, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_10 +; VEX-NEXT: # %bb.11: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; VEX-NEXT: jmp .LBB80_12 +; VEX-NEXT: .LBB80_10: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; VEX-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; VEX-NEXT: .LBB80_12: +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_13 +; VEX-NEXT: # %bb.14: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm6 +; VEX-NEXT: jmp .LBB80_15 +; VEX-NEXT: .LBB80_13: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm6 +; VEX-NEXT: vaddss %xmm6, %xmm6, %xmm6 +; VEX-NEXT: .LBB80_15: +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_16 +; VEX-NEXT: # %bb.17: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm1 +; VEX-NEXT: jmp .LBB80_18 +; VEX-NEXT: .LBB80_16: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax 
+; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB80_18: +; VEX-NEXT: vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3] +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3] +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_19 +; VEX-NEXT: # %bb.20: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm2 +; VEX-NEXT: jmp .LBB80_21 +; VEX-NEXT: .LBB80_19: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB80_21: +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3] +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_22 +; VEX-NEXT: # %bb.23: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm0 +; VEX-NEXT: jmp .LBB80_24 +; VEX-NEXT: .LBB80_22: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: .LBB80_24: +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_8i64_to_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512F-NEXT: vpextrq $1, %xmm2, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm4 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512F-NEXT: vmovq %xmm3, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_8i64_to_8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512VL-NEXT: vmovq %xmm2, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm4 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512VL-NEXT: vmovq %xmm3, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32: @@ -5547,9 +5280,10 @@ ; ; AVX1-LABEL: uitofp_load_8i32_to_8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: 
vcvtdq2ps %ymm1, %ymm1 Index: test/CodeGen/X86/vector-compare-results.ll =================================================================== --- test/CodeGen/X86/vector-compare-results.ll +++ test/CodeGen/X86/vector-compare-results.ll @@ -7509,57 +7509,41 @@ ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $32, %rsp -; AVX1-NEXT: vmovdqa 240(%rbp), %ymm8 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm10 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm10, %xmm9 -; AVX1-NEXT: vmovdqa 208(%rbp), %ymm10 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm7, %xmm7 -; AVX1-NEXT: vpackssdw %xmm9, %xmm7, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-NEXT: vpcmpgtq 256(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vpcmpgtq 240(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vpackssdw %xmm8, %xmm7, %xmm8 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm10, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa 176(%rbp), %ymm9 +; AVX1-NEXT: vpcmpgtq 224(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vpcmpgtq 208(%rbp), %xmm6, %xmm6 ; AVX1-NEXT: vpackssdw %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpackssdw %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa 144(%rbp), %ymm10 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm5, %xmm5 -; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm6 +; AVX1-NEXT: vpackssdw %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX1-NEXT: vpcmpgtq 192(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vpcmpgtq 176(%rbp), %xmm5, %xmm5 +; AVX1-NEXT: vpackssdw %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm10, %xmm4, %xmm4 -; AVX1-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa 112(%rbp), %ymm6 +; AVX1-NEXT: vpcmpgtq 160(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vpcmpgtq 144(%rbp), %xmm4, %xmm4 +; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm4 ; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm8, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vmovdqa 80(%rbp), %ymm7 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpacksswb %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpcmpgtq 128(%rbp), %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq 112(%rbp), %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa 48(%rbp), %ymm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpcmpgtq 96(%rbp), %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq 80(%rbp), %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vmovdqa 16(%rbp), %ymm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq 64(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq 48(%rbp), %xmm1, %xmm1 ; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: 
vpcmpgtq %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq 32(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq 16(%rbp), %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 Index: test/CodeGen/X86/vector-shuffle-combining-avx.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -440,9 +440,8 @@ ; X32-AVX1-LABEL: PR39483: ; X32-AVX1: # %bb.0: # %entry ; X32-AVX1-NEXT: vmovups 32, %ymm0 -; X32-AVX1-NEXT: vmovups 64, %ymm1 -; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; X32-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,3] +; X32-AVX1-NEXT: vmovups 64, %xmm1 +; X32-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,3] ; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; X32-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 @@ -484,9 +483,8 @@ ; X64-AVX1-LABEL: PR39483: ; X64-AVX1: # %bb.0: # %entry ; X64-AVX1-NEXT: vmovups 32, %ymm0 -; X64-AVX1-NEXT: vmovups 64, %ymm1 -; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,3] +; X64-AVX1-NEXT: vmovups 64, %xmm1 +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,3] ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 Index: test/CodeGen/X86/vector-trunc.ll =================================================================== --- test/CodeGen/X86/vector-trunc.ll +++ test/CodeGen/X86/vector-trunc.ll @@ -1988,19 +1988,18 @@ ; ; AVX1-LABEL: PR34773: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %ymm0 -; AVX1-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rsi) ; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR34773: Index: test/CodeGen/X86/viabs.ll =================================================================== --- test/CodeGen/X86/viabs.ll +++ test/CodeGen/X86/viabs.ll @@ -643,24 +643,24 @@ ; ; AVX1-LABEL: test_abs_le_v8i64_fold: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %ymm0 -; AVX1-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5 +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm7, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm5 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: retq ; Index: test/CodeGen/X86/widen_load-3.ll =================================================================== --- test/CodeGen/X86/widen_load-3.ll +++ test/CodeGen/X86/widen_load-3.ll @@ -30,12 +30,12 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: vmovaps (%ecx), %ymm0 -; X86-AVX-NEXT: vmovaps 32(%ecx), %ymm1 -; X86-AVX-NEXT: vmovaps %ymm0, (%eax) -; X86-AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 -; X86-AVX-NEXT: vextractps $1, %xmm0, 52(%eax) -; X86-AVX-NEXT: vmovss %xmm0, 48(%eax) +; X86-AVX-NEXT: vmovaps 48(%ecx), %xmm1 +; X86-AVX-NEXT: vextractps $1, %xmm1, 52(%eax) +; X86-AVX-NEXT: vmovss %xmm1, 48(%eax) +; X86-AVX-NEXT: vmovaps 32(%ecx), %xmm1 ; X86-AVX-NEXT: vmovaps %xmm1, 32(%eax) +; X86-AVX-NEXT: vmovaps %ymm0, (%eax) ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl $4 ; @@ -56,11 +56,11 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: movq %rdi, %rax ; X64-AVX-NEXT: vmovaps (%rsi), %ymm0 -; X64-AVX-NEXT: vmovaps 32(%rsi), %ymm1 -; X64-AVX-NEXT: vmovaps %ymm0, (%rdi) -; X64-AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 -; X64-AVX-NEXT: vmovlps %xmm0, 48(%rdi) +; X64-AVX-NEXT: movq 48(%rsi), %rcx +; X64-AVX-NEXT: movq %rcx, 48(%rdi) +; X64-AVX-NEXT: vmovaps 32(%rsi), %xmm1 ; X64-AVX-NEXT: vmovaps %xmm1, 32(%rdi) +; X64-AVX-NEXT: vmovaps %ymm0, (%rdi) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq %x1 = load <7 x i64>, <7 x i64>* %x Index: test/CodeGen/X86/x86-interleaved-access.ll =================================================================== --- test/CodeGen/X86/x86-interleaved-access.ll +++ test/CodeGen/X86/x86-interleaved-access.ll @@ -389,61 +389,32 @@ } define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) { -; AVX1-LABEL: interleaved_load_vf8_i8_stride4: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7] -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX1-NEXT: vpaddw %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7] -; AVX1-NEXT: vpshufb %xmm3, 
%xmm0, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512-LABEL: interleaved_load_vf8_i8_stride4:
-; AVX2OR512: # %bb.0:
-; AVX2OR512-NEXT: vmovdqu (%rdi), %ymm0
-; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2OR512-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2OR512-NEXT: vpshufb %xmm1, %xmm2, %xmm3
-; AVX2OR512-NEXT: vpshufb %xmm1, %xmm0, %xmm1
-; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
-; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
-; AVX2OR512-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2OR512-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2OR512-NEXT: vpaddw %xmm1, %xmm4, %xmm1
-; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
-; AVX2OR512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2OR512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
-; AVX2OR512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2OR512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
-; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
-; AVX2OR512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2OR512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2OR512-NEXT: vpaddw %xmm3, %xmm0, %xmm0
-; AVX2OR512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX2OR512-NEXT: vzeroupper
-; AVX2OR512-NEXT: retq
+; AVX-LABEL: interleaved_load_vf8_i8_stride4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm0[0],xmm3[0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
+; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT: vpaddw %xmm0, %xmm4, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
+; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,3,2,4,5,6,7]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
%v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
@@ -459,146 +430,144 @@
define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
-; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4
+; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm7
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
-; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm7
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf16_i8_stride4:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm6
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm6
-; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
+; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX2-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm6
-; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
+; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-NEXT: vpcmpeqb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm3[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm7
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm7
-; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6
+; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm7
+; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm7, %xmm4, %xmm3
+; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm4
; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT: vpcmpeqb %zmm5, %zmm8, %k0
-; AVX512-NEXT: vpcmpeqb %zmm0, %zmm3, %k1
+; AVX512-NEXT: vpcmpeqb %zmm0, %zmm4, %k1
; AVX512-NEXT: kxnorw %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -620,84 +589,84 @@
define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm11
-; AVX1-NEXT: vmovdqa 32(%rdi), %ymm14
-; AVX1-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX1-NEXT: vmovdqa 96(%rdi), %ymm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm12
-; AVX1-NEXT: vpshufb %xmm6, %xmm12, %xmm5
-; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm13
-; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm4
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm11
+; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT: vmovdqa 96(%rdi), %xmm12
+; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm3
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm14
+; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm4
+; AVX1-NEXT: vmovdqa 64(%rdi), %xmm6
+; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm8
-; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm15
-; AVX1-NEXT: vpshufb %xmm6, %xmm15, %xmm5
-; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm6
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm6
-; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm4
-; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8
+; AVX1-NEXT: vmovdqa (%rdi), %xmm13
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm15
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm5
+; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm1
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm15, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm4
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm5
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm5, %xmm13, %xmm1
-; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm7
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm4
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm4
-; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm4
-; AVX1-NEXT: vpshufb %xmm5, %xmm11, %xmm5
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm3
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm15, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm1
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm7
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm4
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm5
-; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm5
-; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm4
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm3
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm15, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: 
vpshufb %xmm0, %xmm12, %xmm1 -; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1 +; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm14, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm4 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm1 @@ -716,192 +685,188 @@ ; ; AVX2-LABEL: interleaved_load_vf32_i8_stride4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX2-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm13 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-NEXT: vpshufb %xmm6, %xmm9, %xmm3 -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm10 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],xmm4[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm12 -; AVX2-NEXT: vpshufb %xmm6, %xmm12, %xmm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1] +; AVX2-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX2-NEXT: vpshufb %xmm6, %xmm12, %xmm5 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm7 +; AVX2-NEXT: 
vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3] +; AVX2-NEXT: vmovdqa 112(%rdi), %xmm14 +; AVX2-NEXT: vpshufb %xmm6, %xmm14, %xmm7 +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = mem[2,3,0,1] +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 +; AVX2-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = mem[2,3,0,1] ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm3 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm0 +; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX2-NEXT: vpshufb %xmm1, %xmm14, %xmm3 +; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm8 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm2 -; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb 
%xmm2, %xmm11, %xmm3 +; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm3 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm4 +; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-NEXT: vpshufb %xmm1, %xmm14, %xmm2 +; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm2 ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm3 -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: 
vpcmpeqb %ymm0, %ymm10, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm8, %ymm0 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: interleaved_load_vf32_i8_stride4: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm10 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm3 -; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm5 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0,1],xmm4[2,3] -; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm5 -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm12 -; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm14 -; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1] +; AVX512-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm5 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm7 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3] +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14 +; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = mem[2,3,0,1] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = mem[2,3,0,1] ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX512-NEXT: vpunpckldq 
{{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm4 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5 -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm6 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm5 -; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3 +; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm5 -; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm4 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5 -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm6 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm5 -; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3 +; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm5 -; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; 
AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5 -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm1 -; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm3 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm4 +; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm0 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm2 +; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm3 -; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm2 +; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpcmpeqb %zmm9, %zmm8, %k0 -; AVX512-NEXT: vpcmpeqb %zmm0, %zmm2, %k1 +; AVX512-NEXT: vpcmpeqb %zmm0, %zmm15, %k1 ; AVX512-NEXT: kxnord %k1, %k0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1072,41 +1037,22 @@ } define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){ -; AVX1-LABEL: interleaved_load_vf8_i8_stride3: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u] -; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] -; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0 -; 
AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2OR512-LABEL: interleaved_load_vf8_i8_stride3: -; AVX2OR512: # %bb.0: -; AVX2OR512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2OR512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u] -; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u] -; AVX2OR512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u] -; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] -; AVX2OR512-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u] -; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] -; AVX2OR512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2OR512-NEXT: vpaddw %xmm0, %xmm3, %xmm0 -; AVX2OR512-NEXT: vpaddw %xmm0, %xmm2, %xmm0 -; AVX2OR512-NEXT: vzeroupper -; AVX2OR512-NEXT: retq +; AVX-LABEL: interleaved_load_vf8_i8_stride3: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u] +; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX-NEXT: retq %wide.vec = load <24 x i8>, <24 x i8>* %ptr %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32>
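; Editor's note (not part of the autogenerated FileCheck lines): every diff in
; these interleaved-access tests shows the same improvement. The wide ymm/zmm
; loads followed by vextractf128/vextracti128/vextracti64x4 become direct
; 16-byte xmm loads at the matching offsets, since the one-use restriction on
; narrowing extracted vector loads is relaxed and X86 keeps its permissive
; shouldReduceLoadWidth behavior. A minimal sketch of the shape being narrowed
; (hypothetical function, not taken from this patch):
;
;   define <4 x i32> @hi_half(<8 x i32>* %p) {
;     %w = load <8 x i32>, <8 x i32>* %p
;     %hi = shufflevector <8 x i32> %w, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
;     ret <4 x i32> %hi
;   }
;
; Little-endian, extracting the second of two <4 x i32> halves: this should
; now select to a plain "vmovaps 16(%rdi), %xmm0" rather than a 32-byte load
; plus an extract, which is exactly the pattern repeated throughout the diffs
; above.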