diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -7036,7 +7036,7 @@ unsigned InNumElts = InEC.getFixedValue(); unsigned WidenNumElts = WidenEC.getFixedValue(); - // Fall back to extract and build. + // Fall back to extract and build (+ mask, if padding with zeros). SmallVector<SDValue, 16> Ops(WidenNumElts); EVT EltVT = NVT.getVectorElementType(); unsigned MinNumElts = std::min(WidenNumElts, InNumElts); @@ -7045,9 +7045,21 @@ Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getVectorIdxConstant(Idx, dl)); - SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : - DAG.getUNDEF(EltVT); - for ( ; Idx < WidenNumElts; ++Idx) - Ops[Idx] = FillVal; - return DAG.getBuildVector(NVT, dl, Ops); + SDValue UndefVal = DAG.getUNDEF(EltVT); + for (; Idx < WidenNumElts; ++Idx) + Ops[Idx] = UndefVal; + + SDValue Widened = DAG.getBuildVector(NVT, dl, Ops); + if (!FillWithZeroes) + return Widened; + + assert(NVT.isInteger() && + "We expect to never want to FillWithZeroes for non-integral types."); + + SmallVector<SDValue, 16> MaskOps; + MaskOps.append(MinNumElts, DAG.getAllOnesConstant(dl, EltVT)); + MaskOps.append(WidenNumElts - MinNumElts, DAG.getConstant(0, dl, EltVT)); + + return DAG.getNode(ISD::AND, dl, NVT, Widened, + DAG.getBuildVector(NVT, dl, MaskOps)); } diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -2488,11 +2488,8 @@ ; KNL_64-LABEL: test30: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 -; KNL_64-NEXT: movw $-3, %ax -; KNL_64-NEXT: kmovw %eax, %k0 ; KNL_64-NEXT: andl $1, %edi -; KNL_64-NEXT: kmovw %edi, %k1 -; KNL_64-NEXT: kandw %k0, %k1, %k0 +; KNL_64-NEXT: kmovw %edi, %k0 ; KNL_64-NEXT: kmovw %esi, %k1 ; KNL_64-NEXT: kshiftlw $15, %k1, %k1 ; KNL_64-NEXT: kshiftrw $14, %k1, %k1 @@ -2504,6 +2501,9 @@ ; KNL_64-NEXT: kshiftlw $15, %k1, %k1 ; KNL_64-NEXT: kshiftrw $13, %k1, %k1 ; KNL_64-NEXT: korw %k1, %k0, %k0 +; KNL_64-NEXT: movb $7, %al +; KNL_64-NEXT: kmovw %eax, %k1 +; KNL_64-NEXT: kandw %k1, %k0, %k0 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 @@ -2517,12 +2517,9 @@ ; KNL_32-LABEL: test30: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: movw $-3, %ax -; KNL_32-NEXT: kmovw %eax, %k0 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: andl $1, %eax -; KNL_32-NEXT: kmovw %eax, %k1 -; KNL_32-NEXT: kandw %k0, %k1, %k0 +; KNL_32-NEXT: kmovw %eax, %k0 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: kmovw %eax, %k1 ; KNL_32-NEXT: kshiftlw $15, %k1, %k1 @@ -2536,6 +2533,9 @@ ; KNL_32-NEXT: kshiftlw $15, %k1, %k1 ; KNL_32-NEXT: kshiftrw $13, %k1, %k1 ; KNL_32-NEXT: korw %k1, %k0, %k0 +; KNL_32-NEXT: movb $7, %al +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: kandw %k1, %k0, %k0 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 @@ -2547,23 +2547,23 @@ ; ; SKX-LABEL: test30: ; SKX: # %bb.0: -; SKX-NEXT: movb $-3, %al -; SKX-NEXT: kmovw %eax, %k0 +; SKX-NEXT: kmovw %esi, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 ; SKX-NEXT: kmovw %edi, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: 
kshiftrb $7, %k1, %k1 -; SKX-NEXT: kandw %k0, %k1, %k0 -; SKX-NEXT: kmovw %esi, %k1 -; SKX-NEXT: kshiftlb $7, %k1, %k1 -; SKX-NEXT: kshiftrb $6, %k1, %k1 -; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: movb $-5, %al ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: kandw %k1, %k0, %k0 ; SKX-NEXT: kmovw %edx, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: korw %k1, %k0, %k1 +; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: movb $7, %al +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: kandw %k1, %k0, %k1 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 @@ -2574,18 +2574,15 @@ ; ; SKX_32-LABEL: test30: ; SKX_32: # %bb.0: -; SKX_32-NEXT: movb $-3, %al +; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: kmovw %eax, %k0 +; SKX_32-NEXT: kshiftlb $7, %k0, %k0 +; SKX_32-NEXT: kshiftrb $6, %k0, %k0 ; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: kmovw %eax, %k1 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1 ; SKX_32-NEXT: kshiftrb $7, %k1, %k1 -; SKX_32-NEXT: kandw %k0, %k1, %k0 -; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: kmovw %eax, %k1 -; SKX_32-NEXT: kshiftlb $7, %k1, %k1 -; SKX_32-NEXT: kshiftrb $6, %k1, %k1 -; SKX_32-NEXT: korw %k1, %k0, %k0 +; SKX_32-NEXT: korw %k0, %k1, %k0 ; SKX_32-NEXT: movb $-5, %al ; SKX_32-NEXT: kmovw %eax, %k1 ; SKX_32-NEXT: kandw %k1, %k0, %k0 @@ -2593,7 +2590,10 @@ ; SKX_32-NEXT: kmovw %eax, %k1 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1 ; SKX_32-NEXT: kshiftrb $5, %k1, %k1 -; SKX_32-NEXT: korw %k1, %k0, %k1 +; SKX_32-NEXT: korw %k1, %k0, %k0 +; SKX_32-NEXT: movb $7, %al +; SKX_32-NEXT: kmovw %eax, %k1 +; SKX_32-NEXT: kandw %k1, %k0, %k1 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX_32-NEXT: vpgatherdd (,%xmm0), %xmm2 {%k1} @@ -2612,11 +2612,8 @@ ; KNL_64-LABEL: test30b: ; KNL_64: # %bb.0: ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 -; KNL_64-NEXT: movw $-3, %ax -; KNL_64-NEXT: kmovw %eax, %k0 ; KNL_64-NEXT: andl $1, %edi -; KNL_64-NEXT: kmovw %edi, %k1 -; KNL_64-NEXT: kandw %k0, %k1, %k0 +; KNL_64-NEXT: kmovw %edi, %k0 ; KNL_64-NEXT: kmovw %esi, %k1 ; KNL_64-NEXT: kshiftlw $15, %k1, %k1 ; KNL_64-NEXT: kshiftrw $14, %k1, %k1 @@ -2628,6 +2625,9 @@ ; KNL_64-NEXT: kshiftlw $15, %k1, %k1 ; KNL_64-NEXT: kshiftrw $13, %k1, %k1 ; KNL_64-NEXT: korw %k1, %k0, %k0 +; KNL_64-NEXT: movb $7, %al +; KNL_64-NEXT: kmovw %eax, %k1 +; KNL_64-NEXT: kandw %k1, %k0, %k0 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 @@ -2640,12 +2640,9 @@ ; KNL_32-LABEL: test30b: ; KNL_32: # %bb.0: ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; KNL_32-NEXT: movw $-3, %ax -; KNL_32-NEXT: kmovw %eax, %k0 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: andl $1, %eax -; KNL_32-NEXT: kmovw %eax, %k1 -; KNL_32-NEXT: kandw %k0, %k1, %k0 +; KNL_32-NEXT: kmovw %eax, %k0 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: kmovw %eax, %k1 ; KNL_32-NEXT: kshiftlw $15, %k1, %k1 @@ -2659,6 +2656,9 @@ ; KNL_32-NEXT: kshiftlw $15, %k1, %k1 ; KNL_32-NEXT: kshiftrw $13, %k1, %k1 ; KNL_32-NEXT: korw %k1, %k0, %k0 +; KNL_32-NEXT: movb $7, %al +; KNL_32-NEXT: kmovw %eax, %k1 +; KNL_32-NEXT: kandw %k1, %k0, %k0 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 @@ -2669,23 +2669,23 @@ ; ; SKX-LABEL: test30b: ; SKX: # %bb.0: -; SKX-NEXT: movb $-3, %al -; SKX-NEXT: kmovw %eax, %k0 +; 
SKX-NEXT: kmovw %esi, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 ; SKX-NEXT: kmovw %edi, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $7, %k1, %k1 -; SKX-NEXT: kandw %k0, %k1, %k0 -; SKX-NEXT: kmovw %esi, %k1 -; SKX-NEXT: kshiftlb $7, %k1, %k1 -; SKX-NEXT: kshiftrb $6, %k1, %k1 -; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: movb $-5, %al ; SKX-NEXT: kmovw %eax, %k1 ; SKX-NEXT: kandw %k1, %k0, %k0 ; SKX-NEXT: kmovw %edx, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: korw %k1, %k0, %k1 +; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: movb $7, %al +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: kandw %k1, %k0, %k1 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 @@ -2695,18 +2695,15 @@ ; ; SKX_32-LABEL: test30b: ; SKX_32: # %bb.0: -; SKX_32-NEXT: movb $-3, %al +; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: kmovw %eax, %k0 +; SKX_32-NEXT: kshiftlb $7, %k0, %k0 +; SKX_32-NEXT: kshiftrb $6, %k0, %k0 ; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; SKX_32-NEXT: kmovw %eax, %k1 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1 ; SKX_32-NEXT: kshiftrb $7, %k1, %k1 -; SKX_32-NEXT: kandw %k0, %k1, %k0 -; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: kmovw %eax, %k1 -; SKX_32-NEXT: kshiftlb $7, %k1, %k1 -; SKX_32-NEXT: kshiftrb $6, %k1, %k1 -; SKX_32-NEXT: korw %k1, %k0, %k0 +; SKX_32-NEXT: korw %k0, %k1, %k0 ; SKX_32-NEXT: movb $-5, %al ; SKX_32-NEXT: kmovw %eax, %k1 ; SKX_32-NEXT: kandw %k1, %k0, %k0 @@ -2714,7 +2711,10 @@ ; SKX_32-NEXT: kmovw %eax, %k1 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1 ; SKX_32-NEXT: kshiftrb $5, %k1, %k1 -; SKX_32-NEXT: korw %k1, %k0, %k1 +; SKX_32-NEXT: korw %k1, %k0, %k0 +; SKX_32-NEXT: movb $7, %al +; SKX_32-NEXT: kmovw %eax, %k1 +; SKX_32-NEXT: kandw %k1, %k0, %k1 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX_32-NEXT: vpscatterdd %xmm2, (,%xmm0) {%k1} diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -5324,11 +5324,8 @@ ; AVX512F-LABEL: widen_masked_store: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: movw $-3, %ax -; AVX512F-NEXT: kmovw %eax, %k0 ; AVX512F-NEXT: andl $1, %esi -; AVX512F-NEXT: kmovw %esi, %k1 -; AVX512F-NEXT: kandw %k0, %k1, %k0 +; AVX512F-NEXT: kmovw %esi, %k0 ; AVX512F-NEXT: kmovw %edx, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $14, %k1, %k1 @@ -5340,6 +5337,9 @@ ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $13, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movb $7, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} @@ -5348,33 +5348,30 @@ ; ; AVX512VLDQ-LABEL: widen_masked_store: ; AVX512VLDQ: ## %bb.0: -; AVX512VLDQ-NEXT: movb $-3, %al -; AVX512VLDQ-NEXT: kmovw %eax, %k0 +; AVX512VLDQ-NEXT: kmovw %edx, %k0 +; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 ; AVX512VLDQ-NEXT: kmovw %esi, %k1 ; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 ; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1 -; AVX512VLDQ-NEXT: kandw %k0, %k1, %k0 -; AVX512VLDQ-NEXT: kmovw %edx, %k1 -; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 -; AVX512VLDQ-NEXT: kshiftrb $6, %k1, 
%k1 -; AVX512VLDQ-NEXT: korw %k1, %k0, %k0 +; AVX512VLDQ-NEXT: korw %k0, %k1, %k0 ; AVX512VLDQ-NEXT: movb $-5, %al ; AVX512VLDQ-NEXT: kmovw %eax, %k1 ; AVX512VLDQ-NEXT: kandw %k1, %k0, %k0 ; AVX512VLDQ-NEXT: kmovw %ecx, %k1 ; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 ; AVX512VLDQ-NEXT: kshiftrb $5, %k1, %k1 -; AVX512VLDQ-NEXT: korw %k1, %k0, %k1 +; AVX512VLDQ-NEXT: korw %k1, %k0, %k0 +; AVX512VLDQ-NEXT: movb $7, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: kandw %k1, %k0, %k1 ; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: widen_masked_store: ; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: movw $-3, %ax -; AVX512VLBW-NEXT: kmovd %eax, %k0 ; AVX512VLBW-NEXT: andl $1, %esi -; AVX512VLBW-NEXT: kmovw %esi, %k1 -; AVX512VLBW-NEXT: kandw %k0, %k1, %k0 +; AVX512VLBW-NEXT: kmovw %esi, %k0 ; AVX512VLBW-NEXT: kmovd %edx, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512VLBW-NEXT: kshiftrw $14, %k1, %k1 @@ -5385,29 +5382,32 @@ ; AVX512VLBW-NEXT: kmovd %ecx, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512VLBW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512VLBW-NEXT: korw %k1, %k0, %k1 +; AVX512VLBW-NEXT: korw %k1, %k0, %k0 +; AVX512VLBW-NEXT: movb $7, %al +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: kandw %k1, %k0, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLBW-NEXT: retq ; ; X86-AVX512-LABEL: widen_masked_store: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: movb $-3, %al -; X86-AVX512-NEXT: kmovd %eax, %k0 +; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k0 +; X86-AVX512-NEXT: kshiftlb $7, %k0, %k0 +; X86-AVX512-NEXT: kshiftrb $6, %k0, %k0 ; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1 ; X86-AVX512-NEXT: kshiftrb $7, %k1, %k1 -; X86-AVX512-NEXT: kandw %k0, %k1, %k0 -; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1 -; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1 -; X86-AVX512-NEXT: kshiftrb $6, %k1, %k1 -; X86-AVX512-NEXT: korw %k1, %k0, %k0 +; X86-AVX512-NEXT: korw %k0, %k1, %k0 ; X86-AVX512-NEXT: movb $-5, %al ; X86-AVX512-NEXT: kmovd %eax, %k1 ; X86-AVX512-NEXT: kandw %k1, %k0, %k0 ; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1 ; X86-AVX512-NEXT: kshiftrb $5, %k1, %k1 -; X86-AVX512-NEXT: korw %k1, %k0, %k1 +; X86-AVX512-NEXT: korw %k1, %k0, %k0 +; X86-AVX512-NEXT: movb $7, %al +; X86-AVX512-NEXT: kmovd %eax, %k1 +; X86-AVX512-NEXT: kandw %k1, %k0, %k1 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: vmovdqa32 %xmm0, (%eax) {%k1} ; X86-AVX512-NEXT: retl @@ -6214,17 +6214,14 @@ ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k0 -; AVX512F-NEXT: movw $85, %ax -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 ; AVX512F-NEXT: movw $21845, %ax ## imm = 0x5555 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpcmpgtd (%rdi), %zmm2, %k1 {%k1} +; AVX512F-NEXT: movw $85, %ax ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vpcmpgtd (%rdi), %zmm2, %k2 {%k2} -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdx) {%k2} -; AVX512F-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k1} +; AVX512F-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k2 {%k2} +; AVX512F-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k2} +; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdx) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6233,338 +6230,49 @@ ; AVX512VLDQ-NEXT: vmovdqa64 (%rsi), %zmm0 ; 
AVX512VLDQ-NEXT: vmovdqa64 64(%rsi), %zmm1 ; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k0 -; AVX512VLDQ-NEXT: movw $85, %ax -; AVX512VLDQ-NEXT: kmovw %eax, %k1 -; AVX512VLDQ-NEXT: kandb %k1, %k0, %k0 -; AVX512VLDQ-NEXT: kmovb %k0, %k1 ; AVX512VLDQ-NEXT: movw $21845, %ax ## imm = 0x5555 +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vpcmpgtd (%rdi), %zmm2, %k1 {%k1} +; AVX512VLDQ-NEXT: movw $85, %ax ; AVX512VLDQ-NEXT: kmovw %eax, %k2 -; AVX512VLDQ-NEXT: vpcmpgtd (%rdi), %zmm2, %k2 {%k2} -; AVX512VLDQ-NEXT: vmovdqu32 %zmm0, (%rdx) {%k2} -; AVX512VLDQ-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k1} +; AVX512VLDQ-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k2 {%k2} +; AVX512VLDQ-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k2} +; AVX512VLDQ-NEXT: vmovdqu32 %zmm0, (%rdx) {%k1} ; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: ; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpcmpgtd (%rdi), %zmm0, %k1 -; AVX512VLBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; AVX512VLBW-NEXT: vpcmpgtd 64(%rdi), %zmm0, %k0 -; AVX512VLBW-NEXT: kunpckwd %k1, %k0, %k0 -; AVX512VLBW-NEXT: movl $5592405, %eax ## imm = 0x555555 -; AVX512VLBW-NEXT: kmovd %eax, %k2 -; AVX512VLBW-NEXT: kandd %k2, %k0, %k0 -; AVX512VLBW-NEXT: kshiftrd $21, %k0, %k6 -; AVX512VLBW-NEXT: kshiftrd $20, %k0, %k5 -; AVX512VLBW-NEXT: kshiftrd $19, %k0, %k4 -; AVX512VLBW-NEXT: kshiftrd $18, %k0, %k3 -; AVX512VLBW-NEXT: kshiftrd $16, %k0, %k2 -; AVX512VLBW-NEXT: kshiftrd $17, %k0, %k7 -; AVX512VLBW-NEXT: kshiftlw $15, %k7, %k7 -; AVX512VLBW-NEXT: kshiftrw $14, %k7, %k7 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $15, %k2, %k2 -; AVX512VLBW-NEXT: korw %k7, %k2, %k7 -; AVX512VLBW-NEXT: movw $-5, %ax -; AVX512VLBW-NEXT: kmovd %eax, %k1 -; AVX512VLBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; AVX512VLBW-NEXT: kandw %k1, %k7, %k7 -; AVX512VLBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512VLBW-NEXT: kshiftrw $13, %k3, %k3 -; AVX512VLBW-NEXT: korw %k3, %k7, %k7 -; AVX512VLBW-NEXT: movw $-9, %ax -; AVX512VLBW-NEXT: kmovd %eax, %k3 -; AVX512VLBW-NEXT: kandw %k3, %k7, %k7 -; AVX512VLBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512VLBW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512VLBW-NEXT: korw %k4, %k7, %k7 -; AVX512VLBW-NEXT: movw $-17, %ax -; AVX512VLBW-NEXT: kmovd %eax, %k4 -; AVX512VLBW-NEXT: kandw %k4, %k7, %k7 -; AVX512VLBW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512VLBW-NEXT: kshiftrw $11, %k5, %k5 -; AVX512VLBW-NEXT: korw %k5, %k7, %k7 -; AVX512VLBW-NEXT: movw $-33, %ax -; AVX512VLBW-NEXT: kmovd %eax, %k5 -; AVX512VLBW-NEXT: kandw %k5, %k7, %k7 -; AVX512VLBW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512VLBW-NEXT: kshiftrw $10, %k6, %k6 -; AVX512VLBW-NEXT: korw %k6, %k7, %k7 -; AVX512VLBW-NEXT: movw $-65, %ax -; AVX512VLBW-NEXT: kmovd %eax, %k6 -; AVX512VLBW-NEXT: kandw %k6, %k7, %k7 -; AVX512VLBW-NEXT: kshiftrd $22, %k0, %k1 -; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512VLBW-NEXT: korw %k1, %k7, %k1 -; AVX512VLBW-NEXT: movw $-129, %ax -; AVX512VLBW-NEXT: kmovd %eax, %k7 -; AVX512VLBW-NEXT: kandw %k7, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $23, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $8, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 ; AVX512VLBW-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512VLBW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512VLBW-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k1} -; 
AVX512VLBW-NEXT: kshiftrd $1, %k0, %k1 -; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512VLBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $15, %k2, %k2 -; AVX512VLBW-NEXT: korw %k1, %k2, %k1 -; AVX512VLBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload -; AVX512VLBW-NEXT: kandw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $2, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kandw %k3, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $3, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kandw %k4, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $4, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $11, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kandw %k5, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $5, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kandw %k6, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $6, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $9, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kandw %k7, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $7, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $8, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: movw $-257, %ax ## imm = 0xFEFF -; AVX512VLBW-NEXT: kmovd %eax, %k2 -; AVX512VLBW-NEXT: kandw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $8, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: movw $-513, %ax ## imm = 0xFDFF -; AVX512VLBW-NEXT: kmovd %eax, %k2 -; AVX512VLBW-NEXT: kandw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $9, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: movw $-1025, %ax ## imm = 0xFBFF -; AVX512VLBW-NEXT: kmovd %eax, %k2 -; AVX512VLBW-NEXT: kandw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $10, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $5, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: movw $-2049, %ax ## imm = 0xF7FF -; AVX512VLBW-NEXT: kmovd %eax, %k2 -; AVX512VLBW-NEXT: kandw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $11, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: movw $-4097, %ax ## imm = 0xEFFF -; AVX512VLBW-NEXT: kmovd %eax, %k2 -; AVX512VLBW-NEXT: kandw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $12, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: movw $-8193, %ax ## imm = 0xDFFF -; AVX512VLBW-NEXT: kmovd %eax, %k2 -; AVX512VLBW-NEXT: kandw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $13, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512VLBW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: movw $-16385, %ax ## imm = 0xBFFF +; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-NEXT: movw $21845, %ax ## imm = 
0x5555 +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vpcmpgtd (%rdi), %zmm2, %k1 {%k1} +; AVX512VLBW-NEXT: movw $85, %ax ; AVX512VLBW-NEXT: kmovd %eax, %k2 -; AVX512VLBW-NEXT: kandw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $14, %k0, %k2 -; AVX512VLBW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512VLBW-NEXT: korw %k2, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrd $15, %k0, %k0 -; AVX512VLBW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512VLBW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512VLBW-NEXT: korw %k0, %k1, %k1 +; AVX512VLBW-NEXT: vpcmpgtd 64(%rdi), %zmm2, %k2 {%k2} +; AVX512VLBW-NEXT: vmovdqu32 %zmm1, 64(%rdx) {%k2} ; AVX512VLBW-NEXT: vmovdqu32 %zmm0, (%rdx) {%k1} ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq ; ; X86-AVX512-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: pushl %eax -; X86-AVX512-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; X86-AVX512-NEXT: vpcmpgtd (%eax), %zmm0, %k1 -; X86-AVX512-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill -; X86-AVX512-NEXT: vpcmpgtd 64(%eax), %zmm0, %k0 -; X86-AVX512-NEXT: kunpckwd %k1, %k0, %k0 -; X86-AVX512-NEXT: movl $5592405, %eax ## imm = 0x555555 -; X86-AVX512-NEXT: kmovd %eax, %k2 -; X86-AVX512-NEXT: kandd %k2, %k0, %k0 -; X86-AVX512-NEXT: kshiftrd $21, %k0, %k6 -; X86-AVX512-NEXT: kshiftrd $20, %k0, %k5 -; X86-AVX512-NEXT: kshiftrd $19, %k0, %k4 -; X86-AVX512-NEXT: kshiftrd $18, %k0, %k3 -; X86-AVX512-NEXT: kshiftrd $16, %k0, %k2 -; X86-AVX512-NEXT: kshiftrd $17, %k0, %k7 -; X86-AVX512-NEXT: kshiftlw $15, %k7, %k7 -; X86-AVX512-NEXT: kshiftrw $14, %k7, %k7 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $15, %k2, %k2 -; X86-AVX512-NEXT: korw %k7, %k2, %k7 -; X86-AVX512-NEXT: movw $-5, %ax -; X86-AVX512-NEXT: kmovd %eax, %k1 -; X86-AVX512-NEXT: kmovw %k1, (%esp) ## 2-byte Spill -; X86-AVX512-NEXT: kandw %k1, %k7, %k7 -; X86-AVX512-NEXT: kshiftlw $15, %k3, %k3 -; X86-AVX512-NEXT: kshiftrw $13, %k3, %k3 -; X86-AVX512-NEXT: korw %k3, %k7, %k7 -; X86-AVX512-NEXT: movw $-9, %ax -; X86-AVX512-NEXT: kmovd %eax, %k3 -; X86-AVX512-NEXT: kandw %k3, %k7, %k7 -; X86-AVX512-NEXT: kshiftlw $15, %k4, %k4 -; X86-AVX512-NEXT: kshiftrw $12, %k4, %k4 -; X86-AVX512-NEXT: korw %k4, %k7, %k7 -; X86-AVX512-NEXT: movw $-17, %ax -; X86-AVX512-NEXT: kmovd %eax, %k4 -; X86-AVX512-NEXT: kandw %k4, %k7, %k7 -; X86-AVX512-NEXT: kshiftlw $15, %k5, %k5 -; X86-AVX512-NEXT: kshiftrw $11, %k5, %k5 -; X86-AVX512-NEXT: korw %k5, %k7, %k7 -; X86-AVX512-NEXT: movw $-33, %ax -; X86-AVX512-NEXT: kmovd %eax, %k5 -; X86-AVX512-NEXT: kandw %k5, %k7, %k7 -; X86-AVX512-NEXT: kshiftlw $15, %k6, %k6 -; X86-AVX512-NEXT: kshiftrw $10, %k6, %k6 -; X86-AVX512-NEXT: korw %k6, %k7, %k7 -; X86-AVX512-NEXT: movw $-65, %ax -; X86-AVX512-NEXT: kmovd %eax, %k6 -; X86-AVX512-NEXT: kandw %k6, %k7, %k7 -; X86-AVX512-NEXT: kshiftrd $22, %k0, %k1 -; X86-AVX512-NEXT: kshiftlw $15, %k1, %k1 -; X86-AVX512-NEXT: kshiftrw $9, %k1, %k1 -; X86-AVX512-NEXT: korw %k1, %k7, %k1 -; X86-AVX512-NEXT: movw $-129, %ax -; X86-AVX512-NEXT: kmovd %eax, %k7 -; X86-AVX512-NEXT: kandw %k7, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $23, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $8, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovdqa64 (%ecx), %zmm0 -; X86-AVX512-NEXT: 
vmovdqa64 64(%ecx), %zmm1 -; X86-AVX512-NEXT: vmovdqu32 %zmm1, 64(%eax) {%k1} -; X86-AVX512-NEXT: kshiftrd $1, %k0, %k1 -; X86-AVX512-NEXT: kshiftlw $15, %k1, %k1 -; X86-AVX512-NEXT: kshiftrw $14, %k1, %k1 -; X86-AVX512-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $15, %k2, %k2 -; X86-AVX512-NEXT: korw %k1, %k2, %k1 -; X86-AVX512-NEXT: kmovw (%esp), %k2 ## 2-byte Reload -; X86-AVX512-NEXT: kandw %k2, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $2, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $13, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: kandw %k3, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $3, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $12, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: kandw %k4, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $4, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $11, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: kandw %k5, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $5, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $10, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: kandw %k6, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $6, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $9, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: kandw %k7, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $7, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $8, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: movw $-257, %cx ## imm = 0xFEFF -; X86-AVX512-NEXT: kmovd %ecx, %k2 -; X86-AVX512-NEXT: kandw %k2, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $8, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $7, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: movw $-513, %cx ## imm = 0xFDFF -; X86-AVX512-NEXT: kmovd %ecx, %k2 -; X86-AVX512-NEXT: kandw %k2, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $9, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $6, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: movw $-1025, %cx ## imm = 0xFBFF -; X86-AVX512-NEXT: kmovd %ecx, %k2 -; X86-AVX512-NEXT: kandw %k2, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $10, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $5, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: movw $-2049, %cx ## imm = 0xF7FF -; X86-AVX512-NEXT: kmovd %ecx, %k2 -; X86-AVX512-NEXT: kandw %k2, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $11, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $4, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: movw $-4097, %cx ## imm = 0xEFFF -; X86-AVX512-NEXT: kmovd %ecx, %k2 -; X86-AVX512-NEXT: kandw %k2, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $12, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $3, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: movw $-8193, %cx ## imm = 0xDFFF -; X86-AVX512-NEXT: kmovd %ecx, %k2 -; X86-AVX512-NEXT: kandw %k2, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $13, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $15, %k2, %k2 -; X86-AVX512-NEXT: kshiftrw $2, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: movw $-16385, %cx ## imm = 0xBFFF -; X86-AVX512-NEXT: kmovd 
%ecx, %k2 -; X86-AVX512-NEXT: kandw %k2, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $14, %k0, %k2 -; X86-AVX512-NEXT: kshiftlw $14, %k2, %k2 -; X86-AVX512-NEXT: korw %k2, %k1, %k1 -; X86-AVX512-NEXT: kshiftrd $15, %k0, %k0 -; X86-AVX512-NEXT: kshiftlw $1, %k1, %k1 -; X86-AVX512-NEXT: kshiftrw $1, %k1, %k1 -; X86-AVX512-NEXT: kshiftlw $15, %k0, %k0 -; X86-AVX512-NEXT: korw %k0, %k1, %k1 +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX512-NEXT: vmovdqa64 (%edx), %zmm0 +; X86-AVX512-NEXT: vmovdqa64 64(%edx), %zmm1 +; X86-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-AVX512-NEXT: movw $21845, %dx ## imm = 0x5555 +; X86-AVX512-NEXT: kmovd %edx, %k1 +; X86-AVX512-NEXT: vpcmpgtd (%ecx), %zmm2, %k1 {%k1} +; X86-AVX512-NEXT: movw $85, %dx +; X86-AVX512-NEXT: kmovd %edx, %k2 +; X86-AVX512-NEXT: vpcmpgtd 64(%ecx), %zmm2, %k2 {%k2} +; X86-AVX512-NEXT: vmovdqu32 %zmm1, 64(%eax) {%k2} ; X86-AVX512-NEXT: vmovdqu32 %zmm0, (%eax) {%k1} -; X86-AVX512-NEXT: popl %eax ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl %trigger = load <24 x i32>, ptr %trigger.ptr diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll --- a/llvm/test/CodeGen/X86/pr45563-2.ll +++ b/llvm/test/CodeGen/X86/pr45563-2.ll @@ -20,33 +20,31 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: vmovd %esi, %xmm2 -; CHECK-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrw $3, %r8d, %xmm2, %xmm2 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrw $4, %r9d, %xmm2, %xmm2 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm2 +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm2 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; CHECK-NEXT: vmaskmovps (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm2 -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 -; CHECK-NEXT: vmovss %xmm1, 32(%rax) -; CHECK-NEXT: vmovaps 
%ymm0, (%rax) +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2 +; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u] +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm1, %ymm2 +; CHECK-NEXT: vmovaps %ymm0, (%rdi) +; CHECK-NEXT: vblendvps %xmm1, %xmm2, %xmm4, %xmm0 +; CHECK-NEXT: vmovss %xmm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <9 x float> @llvm.masked.load.v9f32.p0(ptr %addr, i32 4, <9 x i1>%mask, <9 x float> %dst) @@ -63,53 +61,47 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1,2],mem[0] -; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: vmovd %esi, %xmm3 -; CHECK-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrw $3, %r8d, %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrw $4, %r9d, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; CHECK-NEXT: vmaskmovps (%rdi), %ymm3, %ymm4 -; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm2, %ymm2 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm5 -; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm5, %ymm5 -; CHECK-NEXT: vblendvps %xmm4, %xmm5, %xmm1, %xmm1 -; CHECK-NEXT: vmovaps %xmm1, 32(%rax) -; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm1 -; CHECK-NEXT: vblendvps %xmm3, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovss %xmm0, 48(%rax) -; CHECK-NEXT: vmovaps %ymm2, (%rax) +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm2 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: 
vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm4 +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],mem[0],xmm5[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],mem[0] +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2 +; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 +; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm3, %ymm3 +; CHECK-NEXT: vmovaps %ymm0, (%rdi) +; CHECK-NEXT: vblendvps %xmm1, %xmm3, %xmm5, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, 32(%rdi) +; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0 +; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm6, %xmm0 +; CHECK-NEXT: vmovss %xmm0, 48(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <13 x float> @llvm.masked.load.v13f32.p0(ptr %addr, i32 4, <13 x i1>%mask, <13 x float> %dst) @@ -126,56 +118,49 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: vmovd %esi, %xmm3 -; CHECK-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrw $3, %r8d, %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vpinsrw $4, %r9d, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, 
%xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; CHECK-NEXT: vmaskmovps (%rdi), %ymm3, %ymm4 -; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm2, %ymm2 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vmovd %ecx, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $1, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $4, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm5 -; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm5, %ymm5 -; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm6 -; CHECK-NEXT: vblendvps %xmm3, %xmm6, %xmm1, %xmm1 -; CHECK-NEXT: vmovlps %xmm1, 48(%rax) -; CHECK-NEXT: vblendvps %xmm4, %xmm5, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, 32(%rax) -; CHECK-NEXT: vmovaps %ymm2, (%rax) +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm2 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0] +; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3] +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2 +; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm3[u],zero,xmm3[u] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 +; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm3, %ymm3 +; CHECK-NEXT: vmovaps %ymm0, (%rdi) +; 
CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0 +; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovlps %xmm0, 48(%rdi) +; CHECK-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <14 x float> @llvm.masked.load.v14f32.p0(ptr %addr, i32 4, <14 x i1>%mask, <14 x float> %dst) diff --git a/llvm/test/CodeGen/X86/pr45833.ll b/llvm/test/CodeGen/X86/pr45833.ll --- a/llvm/test/CodeGen/X86/pr45833.ll +++ b/llvm/test/CodeGen/X86/pr45833.ll @@ -20,26 +20,24 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi) -; CHECK-NEXT: vmovd %esi, %xmm1 -; CHECK-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrw $3, %r8d, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrw $4, %r9d, %xmm1, %xmm1 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vmovd %esi, %xmm2 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4 +; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,u,u,u],zero,xmm4[u,u,u],zero,xmm4[u,u,u],zero,xmm4[u,u,u] +; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 +; CHECK-NEXT: vmaskmovps %ymm1, %ymm4, 32(%rdi) +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -63,39 +61,33 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: 
vpinsrw $4, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vmovd %esi, %xmm2 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm5 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi) -; CHECK-NEXT: vmovd %esi, %xmm1 -; CHECK-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrw $3, %r8d, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 +; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; CHECK-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrw $4, %r9d, %xmm1, %xmm1 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v13f32.p0(<13 x float> %value, ptr %addr, i32 4, <13 x i1>%mask) @@ -119,41 +111,34 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $4, %eax, %xmm2, 
%xmm2 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vmovd %esi, %xmm2 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi) -; CHECK-NEXT: vmovd %esi, %xmm1 -; CHECK-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrw $3, %r8d, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 +; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; CHECK-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm4[u],zero,xmm4[u] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrw $4, %r9d, %xmm1, %xmm1 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v14f32.p0(<14 x float> %value, ptr %addr, i32 4, <14 x i1>%mask) @@ -228,15 +213,15 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] @@ -246,7 +231,6 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 @@ -261,33 +245,35 @@ ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; CHECK-NEXT: vmaskmovps %ymm2, %ymm3, 32(%rdi) -; CHECK-NEXT: vmovd %eax, %xmm2 -; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmovd %esi, %xmm2 +; CHECK-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, %r8d, %xmm2, %xmm2 ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $8, %r9d, %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 64(%rdi) -; CHECK-NEXT: vmovd %esi, %xmm1 -; CHECK-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, (%rdi) +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm1[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, 64(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v23f32.p0(<23 x float> %value, ptr %addr, i32 4, <23 x i1>%mask)
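Note on the LegalizeVectorTypes.cpp change at the top of this patch: when the widening helper (ModifyToType) is asked to pad with zeros (FillWithZeroes), it now builds the widened vector with undef padding lanes and clears them with an explicit ISD::AND against a constant all-ones/all-zeros lane mask, instead of placing zero constants directly into the BUILD_VECTOR. In the regenerated checks above this shows up as the padding-lane zeroing being applied once to the fully assembled mask register (e.g. movb $7 + kandw) or folded into an already-masked compare, rather than an up-front movw $-3 / kandw. The following is a minimal standalone sketch of why the two forms are equivalent for integer elements; it uses plain scalar arrays rather than SelectionDAG nodes, and the lane count and values are invented for illustration.

// Not LLVM API: scalar model of "pad with undef, then AND with a lane mask"
// versus "pad with zero constants". Lane count and values are arbitrary.
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  // A 3-element input widened to 4 lanes.
  const std::array<uint32_t, 3> In = {7, 9, 11};
  const uint32_t Junk = 0xDEADBEEF; // stands in for the undef padding lane

  // Old form: put zero directly into the padding lane of the built vector.
  const std::array<uint32_t, 4> FilledWithZeroes = {In[0], In[1], In[2], 0};

  // New form: build with an arbitrary padding lane, then AND with a mask
  // that is all-ones for the original lanes and zero for the padding lane.
  std::array<uint32_t, 4> Widened = {In[0], In[1], In[2], Junk};
  const std::array<uint32_t, 4> Mask = {~0u, ~0u, ~0u, 0u};
  for (std::size_t Idx = 0; Idx < Widened.size(); ++Idx)
    Widened[Idx] &= Mask[Idx];

  assert(Widened == FilledWithZeroes); // both forms yield the same vector
  return 0;
}

The sketch compiles as-is with any C++11-or-later compiler; the assert documents that both forms produce the same widened vector, which is what lets the mask logic be combined after the fact in the test output above.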