diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2880,6 +2880,14 @@
   /// Return true if the target has a vector blend instruction.
   virtual bool hasVectorBlend() const { return false; }
 
+  /// Return true if the target can efficiently produce a VecVT-typed splat
+  /// vector of Elt (which may itself be either a scalar or a vector), to
+  /// replace a chain of stores of Elt with a single store of that vector.
+  virtual bool shouldVectorizeScalarElementSplattingStores(SDValue Elt,
+                                                           EVT VecVT) const {
+    return false;
+  }
+
   /// Get the maximum supported factor for interleaved memory accesses.
   /// Default to be the minimum interleave factor: 2.
   virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -662,7 +662,7 @@
   };
 
   // Classify the origin of a stored value.
-  enum class StoreSource { Unknown, Constant, Extract, Load };
+  enum class StoreSource { Unknown, Splat, Constant, Extract, Load };
   StoreSource getStoreSource(SDValue StoreVal) {
     switch (StoreVal.getOpcode()) {
     case ISD::Constant:
@@ -674,7 +674,7 @@
     case ISD::LOAD:
       return StoreSource::Load;
     default:
-      return StoreSource::Unknown;
+      return StoreSource::Splat;
     }
   }
 
@@ -741,9 +741,9 @@
 
   /// This is a helper function for mergeConsecutiveStores. It is used for
   /// store chains that are composed entirely of constant values.
-  bool tryStoreMergeOfConstants(SmallVectorImpl<MemOpLink> &StoreNodes,
-                                unsigned NumConsecutiveStores,
-                                EVT MemVT, SDNode *Root, bool AllowVectors);
+  bool tryStoreMergeOfConstantsOrEltSplat(
+      SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
+      EVT MemVT, SDNode *Root, bool IsConstantSrc, bool AllowVectors);
 
   /// This is a helper function for mergeConsecutiveStores. It is used for
   /// store chains that are composed entirely of extracted vector elements.
@@ -18734,6 +18734,10 @@
           OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR)
         return false;
       break;
+    case StoreSource::Splat:
+      if (OtherBC != Val)
+        return false;
+      break;
     default:
       llvm_unreachable("Unhandled store source for merging");
     }
@@ -18916,16 +18920,16 @@
   }
 }
 
-bool DAGCombiner::tryStoreMergeOfConstants(
+bool DAGCombiner::tryStoreMergeOfConstantsOrEltSplat(
     SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
-    EVT MemVT, SDNode *RootNode, bool AllowVectors) {
+    EVT MemVT, SDNode *RootNode, bool IsConstantSrc, bool AllowVectors) {
   LLVMContext &Context = *DAG.getContext();
   const DataLayout &DL = DAG.getDataLayout();
   int64_t ElementSizeBytes = MemVT.getStoreSize();
   unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
   bool MadeChange = false;
 
-  // Store the constants into memory as one consecutive store.
+  // Store the constants or splat element into memory as one consecutive store.
   while (NumConsecutiveStores >= 2) {
     LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
     unsigned FirstStoreAS = FirstInChain->getAddressSpace();
@@ -18938,16 +18942,18 @@
     for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
       StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
       SDValue StoredVal = ST->getValue();
-      bool IsElementZero = false;
-      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
-        IsElementZero = C->isZero();
-      else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
-        IsElementZero = C->getConstantFPValue()->isNullValue();
-      if (IsElementZero) {
-        if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
-          FirstZeroAfterNonZero = i;
+      if (IsConstantSrc) {
+        bool IsElementZero = false;
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
+          IsElementZero = C->isZero();
+        else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
+          IsElementZero = C->getConstantFPValue()->isNullValue();
+        if (IsElementZero) {
+          if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
+            FirstZeroAfterNonZero = i;
+        }
+        NonZero |= !IsElementZero;
       }
-      NonZero |= !IsElementZero;
 
       // Find a legal type for the constant store.
       unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
@@ -18984,10 +18990,10 @@
 
       // We only use vectors if the constant is known to be zero or the
       // target allows it and the function is not marked with the
-      // noimplicitfloat attribute.
-      if ((!NonZero ||
-           TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
-          AllowVectors) {
+      // noimplicitfloat attribute, or we are merging splatted stores.
+      if (!IsConstantSrc || ((!NonZero || TLI.storeOfVectorConstantIsCheap(
+                                 MemVT, i + 1, FirstStoreAS)) &&
+                             AllowVectors)) {
         // Find a legal type for the vector store.
         unsigned Elts = (i + 1) * NumMemElts;
         EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
@@ -19000,7 +19006,8 @@
       }
     }
 
-    bool UseVector = (LastLegalVectorType > LastLegalType) && AllowVectors;
+    bool UseVector = !IsConstantSrc ||
+                     ((LastLegalVectorType > LastLegalType) && AllowVectors);
     unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
     bool UseTrunc = LastIntegerTrunc && !UseVector;
 
@@ -19033,9 +19040,23 @@
       continue;
     }
 
-    MadeChange |= mergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
-                                                  /*IsConstantSrc*/ true,
-                                                  UseVector, UseTrunc);
+    // If we are producing a splat store, first check that it is profitable.
+    if (!IsConstantSrc) {
+      // Get the type for the merged vector store.
+      unsigned Elts = NumElem * NumMemElts;
+      EVT StoreTy =
+          EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
+      // Can we efficiently produce a vector of the value we are splatting?
+      if (!TLI.shouldVectorizeScalarElementSplattingStores(
+              cast<StoreSDNode>(StoreNodes[0].MemNode)->getValue(), StoreTy)) {
+        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+        NumConsecutiveStores -= NumElem;
+        continue;
+      }
+    }
+
+    MadeChange |= mergeStoresOfConstantsOrVecElts(
+        StoreNodes, MemVT, NumElem, IsConstantSrc, UseVector, UseTrunc);
 
     // Remove merged stores for next iteration.
     StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
@@ -19419,8 +19440,7 @@
   // extracted vector elements.
SDValue StoredVal = peekThroughBitcasts(St->getValue()); const StoreSource StoreSrc = getStoreSource(StoredVal); - if (StoreSrc == StoreSource::Unknown) - return false; + assert(StoreSrc != StoreSource::Unknown && "Expected known source for store"); SmallVector StoreNodes; SDNode *RootNode; @@ -19462,8 +19482,10 @@ assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores"); switch (StoreSrc) { case StoreSource::Constant: - MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores, - MemVT, RootNode, AllowVectors); + case StoreSource::Splat: + MadeChange |= tryStoreMergeOfConstantsOrEltSplat( + StoreNodes, NumConsecutiveStores, MemVT, RootNode, + /*IsConstantSrc=*/StoreSrc == StoreSource::Constant, AllowVectors); break; case StoreSource::Extract: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1516,6 +1516,9 @@ bool hasVectorBlend() const override { return true; } + bool shouldVectorizeScalarElementSplattingStores(SDValue Elt, + EVT VecVT) const override; + unsigned getMaxSupportedInterleaveFactor() const override { return 4; } bool isInlineAsmTargetBranch(const SmallVectorImpl &AsmStrs, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57757,3 +57757,8 @@ return Align(1ULL << ExperimentalPrefInnermostLoopAlignment); return TargetLowering::getPrefLoopAlignment(); } + +bool X86TargetLowering::shouldVectorizeScalarElementSplattingStores( + SDValue Elt, EVT VecVT) const { + return Subtarget.hasSSE2() && VecVT.getSizeInBits() >= 128; +} diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll --- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -772,8 +772,8 @@ ; CHECK-LABEL: merge_vec_stores_of_constants: ; CHECK: # %bb.0: ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, 48(%rdi) -; CHECK-NEXT: vmovaps %xmm0, 64(%rdi) +; CHECK-NEXT: vmovups %ymm0, 48(%rdi) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3 %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4 diff --git a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll --- a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll +++ b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll @@ -161,27 +161,86 @@ } define void @vec128_i8(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec128_i8: -; ALL: # %bb.0: -; ALL-NEXT: movzbl (%rdi), %eax -; ALL-NEXT: notb %al -; ALL-NEXT: movb %al, (%rsi) -; ALL-NEXT: movb %al, 1(%rsi) -; ALL-NEXT: movb %al, 2(%rsi) -; ALL-NEXT: movb %al, 3(%rsi) -; ALL-NEXT: movb %al, 4(%rsi) -; ALL-NEXT: movb %al, 5(%rsi) -; ALL-NEXT: movb %al, 6(%rsi) -; ALL-NEXT: movb %al, 7(%rsi) -; ALL-NEXT: movb %al, 8(%rsi) -; ALL-NEXT: movb %al, 9(%rsi) -; ALL-NEXT: movb %al, 10(%rsi) -; ALL-NEXT: movb %al, 11(%rsi) -; ALL-NEXT: movb %al, 12(%rsi) -; ALL-NEXT: movb %al, 13(%rsi) -; ALL-NEXT: movb %al, 14(%rsi) -; ALL-NEXT: movb %al, 15(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec128_i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movzbl (%rdi), %eax +; SCALAR-NEXT: notb %al +; SCALAR-NEXT: movb %al, (%rsi) +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: 
movb %al, 2(%rsi) +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %al, 4(%rsi) +; SCALAR-NEXT: movb %al, 5(%rsi) +; SCALAR-NEXT: movb %al, 6(%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: movb %al, 9(%rsi) +; SCALAR-NEXT: movb %al, 10(%rsi) +; SCALAR-NEXT: movb %al, 11(%rsi) +; SCALAR-NEXT: movb %al, 12(%rsi) +; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movb %al, 14(%rsi) +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec128_i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: notb %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vec128_i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzbl (%rdi), %eax +; SSSE3-NEXT: notb %al +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, (%rsi) +; SSSE3-NEXT: retq +; +; AVX1-LABEL: vec128_i8: +; AVX1: # %bb.0: +; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: notb %al +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_i8: +; AVX2: # %bb.0: +; AVX2-NEXT: movzbl (%rdi), %eax +; AVX2-NEXT: notb %al +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: notb %al +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: notb %al +; AVX512BW-NEXT: vpbroadcastb %eax, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq %in.elt.not = load i8, ptr %in.elt.ptr, align 64 %in.elt = xor i8 %in.elt.not, -1 %out.elt0.ptr = getelementptr i8, ptr %out.vec.ptr, i64 0 @@ -220,19 +279,65 @@ } define void @vec128_i16(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec128_i16: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movw %ax, (%rsi) -; ALL-NEXT: movw %ax, 2(%rsi) -; ALL-NEXT: movw %ax, 4(%rsi) -; ALL-NEXT: movw %ax, 6(%rsi) -; ALL-NEXT: movw %ax, 8(%rsi) -; ALL-NEXT: movw %ax, 10(%rsi) -; ALL-NEXT: movw %ax, 12(%rsi) -; ALL-NEXT: movw %ax, 14(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec128_i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movw %ax, (%rsi) +; SCALAR-NEXT: movw %ax, 2(%rsi) +; SCALAR-NEXT: movw %ax, 4(%rsi) +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %ax, 8(%rsi) +; SCALAR-NEXT: movw %ax, 10(%rsi) +; SCALAR-NEXT: movw %ax, 12(%rsi) +; SCALAR-NEXT: movw %ax, 14(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec128_i16: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec128_i16: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; 
AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_i16: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec128_i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: notl %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec128_i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: notl %eax +; AVX512BW-NEXT: vpbroadcastw %eax, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: retq %in.elt.not = load i16, ptr %in.elt.ptr, align 64 %in.elt = xor i16 %in.elt.not, -1 %out.elt0.ptr = getelementptr i16, ptr %out.vec.ptr, i64 0 @@ -255,15 +360,50 @@ } define void @vec128_i32(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec128_i32: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movl %eax, (%rsi) -; ALL-NEXT: movl %eax, 4(%rsi) -; ALL-NEXT: movl %eax, 8(%rsi) -; ALL-NEXT: movl %eax, 12(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec128_i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %eax, 8(%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec128_i32: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec128_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec128_i32: +; AVX512: # %bb.0: +; AVX512-NEXT: movl (%rdi), %eax +; AVX512-NEXT: notl %eax +; AVX512-NEXT: vpbroadcastd %eax, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: retq %in.elt.not = load i32, ptr %in.elt.ptr, align 64 %in.elt = xor i32 %in.elt.not, -1 %out.elt0.ptr = getelementptr i32, ptr %out.vec.ptr, i64 0 @@ -278,15 +418,50 @@ } define void @vec128_float(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec128_float: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movl %eax, (%rsi) -; ALL-NEXT: movl %eax, 4(%rsi) -; ALL-NEXT: movl %eax, 8(%rsi) -; ALL-NEXT: movl %eax, 12(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec128_float: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %eax, 8(%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec128_float: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: 
movdqa %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec128_float: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_float: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec128_float: +; AVX512: # %bb.0: +; AVX512-NEXT: movl (%rdi), %eax +; AVX512-NEXT: notl %eax +; AVX512-NEXT: vpbroadcastd %eax, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: retq %in.elt.not = load i32, ptr %in.elt.ptr, align 64 %in.elt.int = xor i32 %in.elt.not, -1 %in.elt = bitcast i32 %in.elt.int to float @@ -302,13 +477,48 @@ } define void @vec128_i64(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec128_i64: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) -; ALL-NEXT: movq %rax, 8(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec128_i64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: notq %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec128_i64: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec128_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: notq %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: notq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec128_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: notq %rax +; AVX512-NEXT: vpbroadcastq %rax, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: retq %in.elt.not = load i64, ptr %in.elt.ptr, align 64 %in.elt = xor i64 %in.elt.not, -1 %out.elt0.ptr = getelementptr i64, ptr %out.vec.ptr, i64 0 @@ -319,13 +529,48 @@ } define void @vec128_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec128_double: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) -; ALL-NEXT: movq %rax, 8(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec128_double: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: notq %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec128_double: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec128_double: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: notq %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_double: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: notq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: 
vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec128_double: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: notq %rax +; AVX512-NEXT: vpbroadcastq %rax, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: retq %in.elt.not = load i64, ptr %in.elt.ptr, align 64 %in.elt.int = xor i64 %in.elt.not, -1 %in.elt = bitcast i64 %in.elt.int to double @@ -337,43 +582,108 @@ } define void @vec256_i8(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec256_i8: -; ALL: # %bb.0: -; ALL-NEXT: movzbl (%rdi), %eax -; ALL-NEXT: notb %al -; ALL-NEXT: movb %al, (%rsi) -; ALL-NEXT: movb %al, 1(%rsi) -; ALL-NEXT: movb %al, 2(%rsi) -; ALL-NEXT: movb %al, 3(%rsi) -; ALL-NEXT: movb %al, 4(%rsi) -; ALL-NEXT: movb %al, 5(%rsi) -; ALL-NEXT: movb %al, 6(%rsi) -; ALL-NEXT: movb %al, 7(%rsi) -; ALL-NEXT: movb %al, 8(%rsi) -; ALL-NEXT: movb %al, 9(%rsi) -; ALL-NEXT: movb %al, 10(%rsi) -; ALL-NEXT: movb %al, 11(%rsi) -; ALL-NEXT: movb %al, 12(%rsi) -; ALL-NEXT: movb %al, 13(%rsi) -; ALL-NEXT: movb %al, 14(%rsi) -; ALL-NEXT: movb %al, 15(%rsi) -; ALL-NEXT: movb %al, 16(%rsi) -; ALL-NEXT: movb %al, 17(%rsi) -; ALL-NEXT: movb %al, 18(%rsi) -; ALL-NEXT: movb %al, 19(%rsi) -; ALL-NEXT: movb %al, 20(%rsi) -; ALL-NEXT: movb %al, 21(%rsi) -; ALL-NEXT: movb %al, 22(%rsi) -; ALL-NEXT: movb %al, 23(%rsi) -; ALL-NEXT: movb %al, 24(%rsi) -; ALL-NEXT: movb %al, 25(%rsi) -; ALL-NEXT: movb %al, 26(%rsi) -; ALL-NEXT: movb %al, 27(%rsi) -; ALL-NEXT: movb %al, 28(%rsi) -; ALL-NEXT: movb %al, 29(%rsi) -; ALL-NEXT: movb %al, 30(%rsi) -; ALL-NEXT: movb %al, 31(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec256_i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movzbl (%rdi), %eax +; SCALAR-NEXT: notb %al +; SCALAR-NEXT: movb %al, (%rsi) +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: movb %al, 2(%rsi) +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %al, 4(%rsi) +; SCALAR-NEXT: movb %al, 5(%rsi) +; SCALAR-NEXT: movb %al, 6(%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: movb %al, 9(%rsi) +; SCALAR-NEXT: movb %al, 10(%rsi) +; SCALAR-NEXT: movb %al, 11(%rsi) +; SCALAR-NEXT: movb %al, 12(%rsi) +; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movb %al, 14(%rsi) +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %al, 16(%rsi) +; SCALAR-NEXT: movb %al, 17(%rsi) +; SCALAR-NEXT: movb %al, 18(%rsi) +; SCALAR-NEXT: movb %al, 19(%rsi) +; SCALAR-NEXT: movb %al, 20(%rsi) +; SCALAR-NEXT: movb %al, 21(%rsi) +; SCALAR-NEXT: movb %al, 22(%rsi) +; SCALAR-NEXT: movb %al, 23(%rsi) +; SCALAR-NEXT: movb %al, 24(%rsi) +; SCALAR-NEXT: movb %al, 25(%rsi) +; SCALAR-NEXT: movb %al, 26(%rsi) +; SCALAR-NEXT: movb %al, 27(%rsi) +; SCALAR-NEXT: movb %al, 28(%rsi) +; SCALAR-NEXT: movb %al, 29(%rsi) +; SCALAR-NEXT: movb %al, 30(%rsi) +; SCALAR-NEXT: movb %al, 31(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec256_i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: notb %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vec256_i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzbl (%rdi), %eax +; SSSE3-NEXT: notb %al +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: 
movdqa %xmm0, (%rsi) +; SSSE3-NEXT: movdqa %xmm0, 16(%rsi) +; SSSE3-NEXT: retq +; +; AVX1-LABEL: vec256_i8: +; AVX1: # %bb.0: +; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: notb %al +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_i8: +; AVX2: # %bb.0: +; AVX2-NEXT: movzbl (%rdi), %eax +; AVX2-NEXT: notb %al +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: notb %al +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: notb %al +; AVX512BW-NEXT: vpbroadcastb %eax, %ymm0 +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.elt.not = load i8, ptr %in.elt.ptr, align 64 %in.elt = xor i8 %in.elt.not, -1 %out.elt0.ptr = getelementptr i8, ptr %out.vec.ptr, i64 0 @@ -444,27 +754,79 @@ } define void @vec256_i16(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec256_i16: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movw %ax, (%rsi) -; ALL-NEXT: movw %ax, 2(%rsi) -; ALL-NEXT: movw %ax, 4(%rsi) -; ALL-NEXT: movw %ax, 6(%rsi) -; ALL-NEXT: movw %ax, 8(%rsi) -; ALL-NEXT: movw %ax, 10(%rsi) -; ALL-NEXT: movw %ax, 12(%rsi) -; ALL-NEXT: movw %ax, 14(%rsi) -; ALL-NEXT: movw %ax, 16(%rsi) -; ALL-NEXT: movw %ax, 18(%rsi) -; ALL-NEXT: movw %ax, 20(%rsi) -; ALL-NEXT: movw %ax, 22(%rsi) -; ALL-NEXT: movw %ax, 24(%rsi) -; ALL-NEXT: movw %ax, 26(%rsi) -; ALL-NEXT: movw %ax, 28(%rsi) -; ALL-NEXT: movw %ax, 30(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec256_i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movw %ax, (%rsi) +; SCALAR-NEXT: movw %ax, 2(%rsi) +; SCALAR-NEXT: movw %ax, 4(%rsi) +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %ax, 8(%rsi) +; SCALAR-NEXT: movw %ax, 10(%rsi) +; SCALAR-NEXT: movw %ax, 12(%rsi) +; SCALAR-NEXT: movw %ax, 14(%rsi) +; SCALAR-NEXT: movw %ax, 16(%rsi) +; SCALAR-NEXT: movw %ax, 18(%rsi) +; SCALAR-NEXT: movw %ax, 20(%rsi) +; SCALAR-NEXT: movw %ax, 22(%rsi) +; SCALAR-NEXT: movw %ax, 24(%rsi) +; SCALAR-NEXT: movw %ax, 26(%rsi) +; SCALAR-NEXT: movw %ax, 28(%rsi) +; SCALAR-NEXT: movw %ax, 30(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec256_i16: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec256_i16: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_i16: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd 
%eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec256_i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: notl %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec256_i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: notl %eax +; AVX512BW-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.elt.not = load i16, ptr %in.elt.ptr, align 64 %in.elt = xor i16 %in.elt.not, -1 %out.elt0.ptr = getelementptr i16, ptr %out.vec.ptr, i64 0 @@ -503,19 +865,58 @@ } define void @vec256_i32(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec256_i32: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movl %eax, (%rsi) -; ALL-NEXT: movl %eax, 4(%rsi) -; ALL-NEXT: movl %eax, 8(%rsi) -; ALL-NEXT: movl %eax, 12(%rsi) -; ALL-NEXT: movl %eax, 16(%rsi) -; ALL-NEXT: movl %eax, 20(%rsi) -; ALL-NEXT: movl %eax, 24(%rsi) -; ALL-NEXT: movl %eax, 28(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec256_i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %eax, 8(%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %eax, 16(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl %eax, 24(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec256_i32: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec256_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec256_i32: +; AVX512: # %bb.0: +; AVX512-NEXT: movl (%rdi), %eax +; AVX512-NEXT: notl %eax +; AVX512-NEXT: vpbroadcastd %eax, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i32, ptr %in.elt.ptr, align 64 %in.elt = xor i32 %in.elt.not, -1 %out.elt0.ptr = getelementptr i32, ptr %out.vec.ptr, i64 0 @@ -538,19 +939,58 @@ } define void @vec256_float(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec256_float: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movl %eax, (%rsi) -; ALL-NEXT: movl %eax, 4(%rsi) -; ALL-NEXT: movl %eax, 8(%rsi) -; ALL-NEXT: movl %eax, 12(%rsi) -; ALL-NEXT: movl %eax, 16(%rsi) -; ALL-NEXT: movl %eax, 20(%rsi) -; ALL-NEXT: movl %eax, 24(%rsi) -; ALL-NEXT: movl %eax, 28(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec256_float: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: 
movl %eax, 8(%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %eax, 16(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl %eax, 24(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec256_float: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec256_float: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_float: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec256_float: +; AVX512: # %bb.0: +; AVX512-NEXT: movl (%rdi), %eax +; AVX512-NEXT: notl %eax +; AVX512-NEXT: vpbroadcastd %eax, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i32, ptr %in.elt.ptr, align 64 %in.elt.int = xor i32 %in.elt.not, -1 %in.elt = bitcast i32 %in.elt.int to float @@ -574,15 +1014,55 @@ } define void @vec256_i64(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec256_i64: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) -; ALL-NEXT: movq %rax, 8(%rsi) -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rax, 24(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec256_i64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: notq %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec256_i64: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec256_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: notq %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: notq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec256_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: notq %rax +; AVX512-NEXT: vpbroadcastq %rax, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i64, ptr %in.elt.ptr, align 64 %in.elt = xor i64 %in.elt.not, -1 %out.elt0.ptr = getelementptr i64, ptr %out.vec.ptr, i64 0 @@ -597,15 +1077,55 @@ } define void @vec256_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec256_double: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) -; ALL-NEXT: movq %rax, 8(%rsi) -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rax, 
24(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec256_double: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: notq %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec256_double: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec256_double: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: notq %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec256_double: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: notq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec256_double: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: notq %rax +; AVX512-NEXT: vpbroadcastq %rax, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i64, ptr %in.elt.ptr, align 64 %in.elt.int = xor i64 %in.elt.not, -1 %in.elt = bitcast i64 %in.elt.int to double @@ -642,59 +1162,130 @@ } define void @vec384_i8(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec384_i8: -; ALL: # %bb.0: -; ALL-NEXT: movzbl (%rdi), %eax -; ALL-NEXT: notb %al -; ALL-NEXT: movb %al, (%rsi) -; ALL-NEXT: movb %al, 1(%rsi) -; ALL-NEXT: movb %al, 2(%rsi) -; ALL-NEXT: movb %al, 3(%rsi) -; ALL-NEXT: movb %al, 4(%rsi) -; ALL-NEXT: movb %al, 5(%rsi) -; ALL-NEXT: movb %al, 6(%rsi) -; ALL-NEXT: movb %al, 7(%rsi) -; ALL-NEXT: movb %al, 8(%rsi) -; ALL-NEXT: movb %al, 9(%rsi) -; ALL-NEXT: movb %al, 10(%rsi) -; ALL-NEXT: movb %al, 11(%rsi) -; ALL-NEXT: movb %al, 12(%rsi) -; ALL-NEXT: movb %al, 13(%rsi) -; ALL-NEXT: movb %al, 14(%rsi) -; ALL-NEXT: movb %al, 15(%rsi) -; ALL-NEXT: movb %al, 16(%rsi) -; ALL-NEXT: movb %al, 17(%rsi) -; ALL-NEXT: movb %al, 18(%rsi) -; ALL-NEXT: movb %al, 19(%rsi) -; ALL-NEXT: movb %al, 20(%rsi) -; ALL-NEXT: movb %al, 21(%rsi) -; ALL-NEXT: movb %al, 22(%rsi) -; ALL-NEXT: movb %al, 23(%rsi) -; ALL-NEXT: movb %al, 24(%rsi) -; ALL-NEXT: movb %al, 25(%rsi) -; ALL-NEXT: movb %al, 26(%rsi) -; ALL-NEXT: movb %al, 27(%rsi) -; ALL-NEXT: movb %al, 28(%rsi) -; ALL-NEXT: movb %al, 29(%rsi) -; ALL-NEXT: movb %al, 30(%rsi) -; ALL-NEXT: movb %al, 31(%rsi) -; ALL-NEXT: movb %al, 32(%rsi) -; ALL-NEXT: movb %al, 33(%rsi) -; ALL-NEXT: movb %al, 34(%rsi) -; ALL-NEXT: movb %al, 35(%rsi) -; ALL-NEXT: movb %al, 36(%rsi) -; ALL-NEXT: movb %al, 37(%rsi) -; ALL-NEXT: movb %al, 38(%rsi) -; ALL-NEXT: movb %al, 39(%rsi) -; ALL-NEXT: movb %al, 40(%rsi) -; ALL-NEXT: movb %al, 41(%rsi) -; ALL-NEXT: movb %al, 42(%rsi) -; ALL-NEXT: movb %al, 43(%rsi) -; ALL-NEXT: movb %al, 44(%rsi) -; ALL-NEXT: movb %al, 45(%rsi) -; ALL-NEXT: movb %al, 46(%rsi) -; ALL-NEXT: movb %al, 47(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec384_i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movzbl (%rdi), %eax +; SCALAR-NEXT: notb %al +; SCALAR-NEXT: movb %al, (%rsi) +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: movb %al, 2(%rsi) +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %al, 4(%rsi) +; SCALAR-NEXT: 
movb %al, 5(%rsi) +; SCALAR-NEXT: movb %al, 6(%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: movb %al, 9(%rsi) +; SCALAR-NEXT: movb %al, 10(%rsi) +; SCALAR-NEXT: movb %al, 11(%rsi) +; SCALAR-NEXT: movb %al, 12(%rsi) +; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movb %al, 14(%rsi) +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %al, 16(%rsi) +; SCALAR-NEXT: movb %al, 17(%rsi) +; SCALAR-NEXT: movb %al, 18(%rsi) +; SCALAR-NEXT: movb %al, 19(%rsi) +; SCALAR-NEXT: movb %al, 20(%rsi) +; SCALAR-NEXT: movb %al, 21(%rsi) +; SCALAR-NEXT: movb %al, 22(%rsi) +; SCALAR-NEXT: movb %al, 23(%rsi) +; SCALAR-NEXT: movb %al, 24(%rsi) +; SCALAR-NEXT: movb %al, 25(%rsi) +; SCALAR-NEXT: movb %al, 26(%rsi) +; SCALAR-NEXT: movb %al, 27(%rsi) +; SCALAR-NEXT: movb %al, 28(%rsi) +; SCALAR-NEXT: movb %al, 29(%rsi) +; SCALAR-NEXT: movb %al, 30(%rsi) +; SCALAR-NEXT: movb %al, 31(%rsi) +; SCALAR-NEXT: movb %al, 32(%rsi) +; SCALAR-NEXT: movb %al, 33(%rsi) +; SCALAR-NEXT: movb %al, 34(%rsi) +; SCALAR-NEXT: movb %al, 35(%rsi) +; SCALAR-NEXT: movb %al, 36(%rsi) +; SCALAR-NEXT: movb %al, 37(%rsi) +; SCALAR-NEXT: movb %al, 38(%rsi) +; SCALAR-NEXT: movb %al, 39(%rsi) +; SCALAR-NEXT: movb %al, 40(%rsi) +; SCALAR-NEXT: movb %al, 41(%rsi) +; SCALAR-NEXT: movb %al, 42(%rsi) +; SCALAR-NEXT: movb %al, 43(%rsi) +; SCALAR-NEXT: movb %al, 44(%rsi) +; SCALAR-NEXT: movb %al, 45(%rsi) +; SCALAR-NEXT: movb %al, 46(%rsi) +; SCALAR-NEXT: movb %al, 47(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec384_i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: notb %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vec384_i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzbl (%rdi), %eax +; SSSE3-NEXT: notb %al +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, (%rsi) +; SSSE3-NEXT: movdqa %xmm0, 16(%rsi) +; SSSE3-NEXT: movdqa %xmm0, 32(%rsi) +; SSSE3-NEXT: retq +; +; AVX1-LABEL: vec384_i8: +; AVX1: # %bb.0: +; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: notb %al +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_i8: +; AVX2: # %bb.0: +; AVX2-NEXT: movzbl (%rdi), %eax +; AVX2-NEXT: notb %al +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: notb %al +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec384_i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: notb %al +; AVX512BW-NEXT: vpbroadcastb %eax, %ymm0 +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: 
vmovdqa %xmm0, 32(%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.elt.not = load i8, ptr %in.elt.ptr, align 64 %in.elt = xor i8 %in.elt.not, -1 %out.elt0.ptr = getelementptr i8, ptr %out.vec.ptr, i64 0 @@ -797,35 +1388,92 @@ } define void @vec384_i16(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec384_i16: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movw %ax, (%rsi) -; ALL-NEXT: movw %ax, 2(%rsi) -; ALL-NEXT: movw %ax, 4(%rsi) -; ALL-NEXT: movw %ax, 6(%rsi) -; ALL-NEXT: movw %ax, 8(%rsi) -; ALL-NEXT: movw %ax, 10(%rsi) -; ALL-NEXT: movw %ax, 12(%rsi) -; ALL-NEXT: movw %ax, 14(%rsi) -; ALL-NEXT: movw %ax, 16(%rsi) -; ALL-NEXT: movw %ax, 18(%rsi) -; ALL-NEXT: movw %ax, 20(%rsi) -; ALL-NEXT: movw %ax, 22(%rsi) -; ALL-NEXT: movw %ax, 24(%rsi) -; ALL-NEXT: movw %ax, 26(%rsi) -; ALL-NEXT: movw %ax, 28(%rsi) -; ALL-NEXT: movw %ax, 30(%rsi) -; ALL-NEXT: movw %ax, 32(%rsi) -; ALL-NEXT: movw %ax, 34(%rsi) -; ALL-NEXT: movw %ax, 36(%rsi) -; ALL-NEXT: movw %ax, 38(%rsi) -; ALL-NEXT: movw %ax, 40(%rsi) -; ALL-NEXT: movw %ax, 42(%rsi) -; ALL-NEXT: movw %ax, 44(%rsi) -; ALL-NEXT: movw %ax, 46(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec384_i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movw %ax, (%rsi) +; SCALAR-NEXT: movw %ax, 2(%rsi) +; SCALAR-NEXT: movw %ax, 4(%rsi) +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %ax, 8(%rsi) +; SCALAR-NEXT: movw %ax, 10(%rsi) +; SCALAR-NEXT: movw %ax, 12(%rsi) +; SCALAR-NEXT: movw %ax, 14(%rsi) +; SCALAR-NEXT: movw %ax, 16(%rsi) +; SCALAR-NEXT: movw %ax, 18(%rsi) +; SCALAR-NEXT: movw %ax, 20(%rsi) +; SCALAR-NEXT: movw %ax, 22(%rsi) +; SCALAR-NEXT: movw %ax, 24(%rsi) +; SCALAR-NEXT: movw %ax, 26(%rsi) +; SCALAR-NEXT: movw %ax, 28(%rsi) +; SCALAR-NEXT: movw %ax, 30(%rsi) +; SCALAR-NEXT: movw %ax, 32(%rsi) +; SCALAR-NEXT: movw %ax, 34(%rsi) +; SCALAR-NEXT: movw %ax, 36(%rsi) +; SCALAR-NEXT: movw %ax, 38(%rsi) +; SCALAR-NEXT: movw %ax, 40(%rsi) +; SCALAR-NEXT: movw %ax, 42(%rsi) +; SCALAR-NEXT: movw %ax, 44(%rsi) +; SCALAR-NEXT: movw %ax, 46(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec384_i16: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec384_i16: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_i16: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec384_i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: notl %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; 
+; AVX512BW-LABEL: vec384_i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: notl %eax +; AVX512BW-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.elt.not = load i16, ptr %in.elt.ptr, align 64 %in.elt = xor i16 %in.elt.not, -1 %out.elt0.ptr = getelementptr i16, ptr %out.vec.ptr, i64 0 @@ -880,23 +1528,66 @@ } define void @vec384_i32(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec384_i32: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movl %eax, (%rsi) -; ALL-NEXT: movl %eax, 4(%rsi) -; ALL-NEXT: movl %eax, 8(%rsi) -; ALL-NEXT: movl %eax, 12(%rsi) -; ALL-NEXT: movl %eax, 16(%rsi) -; ALL-NEXT: movl %eax, 20(%rsi) -; ALL-NEXT: movl %eax, 24(%rsi) -; ALL-NEXT: movl %eax, 28(%rsi) -; ALL-NEXT: movl %eax, 32(%rsi) -; ALL-NEXT: movl %eax, 36(%rsi) -; ALL-NEXT: movl %eax, 40(%rsi) -; ALL-NEXT: movl %eax, 44(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec384_i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %eax, 8(%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %eax, 16(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl %eax, 24(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %eax, 32(%rsi) +; SCALAR-NEXT: movl %eax, 36(%rsi) +; SCALAR-NEXT: movl %eax, 40(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec384_i32: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec384_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec384_i32: +; AVX512: # %bb.0: +; AVX512-NEXT: movl (%rdi), %eax +; AVX512-NEXT: notl %eax +; AVX512-NEXT: vpbroadcastd %eax, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i32, ptr %in.elt.ptr, align 64 %in.elt = xor i32 %in.elt.not, -1 %out.elt0.ptr = getelementptr i32, ptr %out.vec.ptr, i64 0 @@ -927,23 +1618,66 @@ } define void @vec384_float(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec384_float: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movl %eax, (%rsi) -; ALL-NEXT: movl %eax, 4(%rsi) -; ALL-NEXT: movl %eax, 8(%rsi) -; ALL-NEXT: movl %eax, 12(%rsi) -; ALL-NEXT: movl %eax, 16(%rsi) -; ALL-NEXT: movl %eax, 20(%rsi) -; ALL-NEXT: movl %eax, 24(%rsi) -; ALL-NEXT: movl %eax, 28(%rsi) -; ALL-NEXT: movl %eax, 32(%rsi) -; ALL-NEXT: movl %eax, 36(%rsi) -; ALL-NEXT: movl %eax, 40(%rsi) -; ALL-NEXT: movl %eax, 44(%rsi) -; ALL-NEXT: retq +; 
SCALAR-LABEL: vec384_float: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %eax, 8(%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %eax, 16(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl %eax, 24(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %eax, 32(%rsi) +; SCALAR-NEXT: movl %eax, 36(%rsi) +; SCALAR-NEXT: movl %eax, 40(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec384_float: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec384_float: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_float: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec384_float: +; AVX512: # %bb.0: +; AVX512-NEXT: movl (%rdi), %eax +; AVX512-NEXT: notl %eax +; AVX512-NEXT: vpbroadcastd %eax, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i32, ptr %in.elt.ptr, align 64 %in.elt.int = xor i32 %in.elt.not, -1 %in.elt = bitcast i32 %in.elt.int to float @@ -975,17 +1709,61 @@ } define void @vec384_i64(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec384_i64: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) -; ALL-NEXT: movq %rax, 8(%rsi) -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rax, 24(%rsi) -; ALL-NEXT: movq %rax, 32(%rsi) -; ALL-NEXT: movq %rax, 40(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec384_i64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: notq %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: movq %rax, 40(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec384_i64: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec384_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: notq %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %xmm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: notq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; 
AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec384_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: notq %rax +; AVX512-NEXT: vpbroadcastq %rax, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i64, ptr %in.elt.ptr, align 64 %in.elt = xor i64 %in.elt.not, -1 %out.elt0.ptr = getelementptr i64, ptr %out.vec.ptr, i64 0 @@ -1004,17 +1782,61 @@ } define void @vec384_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec384_double: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) -; ALL-NEXT: movq %rax, 8(%rsi) -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rax, 24(%rsi) -; ALL-NEXT: movq %rax, 32(%rsi) -; ALL-NEXT: movq %rax, 40(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec384_double: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: notq %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: movq %rax, 40(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec384_double: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec384_double: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: notq %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %xmm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec384_double: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: notq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec384_double: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: notq %rax +; AVX512-NEXT: vpbroadcastq %rax, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i64, ptr %in.elt.ptr, align 64 %in.elt.int = xor i64 %in.elt.not, -1 %in.elt = bitcast i64 %in.elt.int to double @@ -1059,75 +1881,148 @@ } define void @vec512_i8(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec512_i8: -; ALL: # %bb.0: -; ALL-NEXT: movzbl (%rdi), %eax -; ALL-NEXT: notb %al -; ALL-NEXT: movb %al, (%rsi) -; ALL-NEXT: movb %al, 1(%rsi) -; ALL-NEXT: movb %al, 2(%rsi) -; ALL-NEXT: movb %al, 3(%rsi) -; ALL-NEXT: movb %al, 4(%rsi) -; ALL-NEXT: movb %al, 5(%rsi) -; ALL-NEXT: movb %al, 6(%rsi) -; ALL-NEXT: movb %al, 7(%rsi) -; ALL-NEXT: movb %al, 8(%rsi) -; ALL-NEXT: movb %al, 9(%rsi) -; ALL-NEXT: movb %al, 10(%rsi) -; ALL-NEXT: movb %al, 11(%rsi) -; ALL-NEXT: movb %al, 12(%rsi) -; ALL-NEXT: movb %al, 13(%rsi) -; ALL-NEXT: movb %al, 14(%rsi) -; ALL-NEXT: movb %al, 15(%rsi) -; ALL-NEXT: movb %al, 16(%rsi) -; ALL-NEXT: movb %al, 17(%rsi) -; ALL-NEXT: movb %al, 18(%rsi) -; ALL-NEXT: movb %al, 19(%rsi) -; ALL-NEXT: movb %al, 20(%rsi) -; ALL-NEXT: movb %al, 21(%rsi) -; ALL-NEXT: movb %al, 22(%rsi) -; ALL-NEXT: movb %al, 23(%rsi) -; 
ALL-NEXT: movb %al, 24(%rsi) -; ALL-NEXT: movb %al, 25(%rsi) -; ALL-NEXT: movb %al, 26(%rsi) -; ALL-NEXT: movb %al, 27(%rsi) -; ALL-NEXT: movb %al, 28(%rsi) -; ALL-NEXT: movb %al, 29(%rsi) -; ALL-NEXT: movb %al, 30(%rsi) -; ALL-NEXT: movb %al, 31(%rsi) -; ALL-NEXT: movb %al, 32(%rsi) -; ALL-NEXT: movb %al, 33(%rsi) -; ALL-NEXT: movb %al, 34(%rsi) -; ALL-NEXT: movb %al, 35(%rsi) -; ALL-NEXT: movb %al, 36(%rsi) -; ALL-NEXT: movb %al, 37(%rsi) -; ALL-NEXT: movb %al, 38(%rsi) -; ALL-NEXT: movb %al, 39(%rsi) -; ALL-NEXT: movb %al, 40(%rsi) -; ALL-NEXT: movb %al, 41(%rsi) -; ALL-NEXT: movb %al, 42(%rsi) -; ALL-NEXT: movb %al, 43(%rsi) -; ALL-NEXT: movb %al, 44(%rsi) -; ALL-NEXT: movb %al, 45(%rsi) -; ALL-NEXT: movb %al, 46(%rsi) -; ALL-NEXT: movb %al, 47(%rsi) -; ALL-NEXT: movb %al, 48(%rsi) -; ALL-NEXT: movb %al, 49(%rsi) -; ALL-NEXT: movb %al, 50(%rsi) -; ALL-NEXT: movb %al, 51(%rsi) -; ALL-NEXT: movb %al, 52(%rsi) -; ALL-NEXT: movb %al, 53(%rsi) -; ALL-NEXT: movb %al, 54(%rsi) -; ALL-NEXT: movb %al, 55(%rsi) -; ALL-NEXT: movb %al, 56(%rsi) -; ALL-NEXT: movb %al, 57(%rsi) -; ALL-NEXT: movb %al, 58(%rsi) -; ALL-NEXT: movb %al, 59(%rsi) -; ALL-NEXT: movb %al, 60(%rsi) -; ALL-NEXT: movb %al, 61(%rsi) -; ALL-NEXT: movb %al, 62(%rsi) -; ALL-NEXT: movb %al, 63(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec512_i8: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movzbl (%rdi), %eax +; SCALAR-NEXT: notb %al +; SCALAR-NEXT: movb %al, (%rsi) +; SCALAR-NEXT: movb %al, 1(%rsi) +; SCALAR-NEXT: movb %al, 2(%rsi) +; SCALAR-NEXT: movb %al, 3(%rsi) +; SCALAR-NEXT: movb %al, 4(%rsi) +; SCALAR-NEXT: movb %al, 5(%rsi) +; SCALAR-NEXT: movb %al, 6(%rsi) +; SCALAR-NEXT: movb %al, 7(%rsi) +; SCALAR-NEXT: movb %al, 8(%rsi) +; SCALAR-NEXT: movb %al, 9(%rsi) +; SCALAR-NEXT: movb %al, 10(%rsi) +; SCALAR-NEXT: movb %al, 11(%rsi) +; SCALAR-NEXT: movb %al, 12(%rsi) +; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movb %al, 14(%rsi) +; SCALAR-NEXT: movb %al, 15(%rsi) +; SCALAR-NEXT: movb %al, 16(%rsi) +; SCALAR-NEXT: movb %al, 17(%rsi) +; SCALAR-NEXT: movb %al, 18(%rsi) +; SCALAR-NEXT: movb %al, 19(%rsi) +; SCALAR-NEXT: movb %al, 20(%rsi) +; SCALAR-NEXT: movb %al, 21(%rsi) +; SCALAR-NEXT: movb %al, 22(%rsi) +; SCALAR-NEXT: movb %al, 23(%rsi) +; SCALAR-NEXT: movb %al, 24(%rsi) +; SCALAR-NEXT: movb %al, 25(%rsi) +; SCALAR-NEXT: movb %al, 26(%rsi) +; SCALAR-NEXT: movb %al, 27(%rsi) +; SCALAR-NEXT: movb %al, 28(%rsi) +; SCALAR-NEXT: movb %al, 29(%rsi) +; SCALAR-NEXT: movb %al, 30(%rsi) +; SCALAR-NEXT: movb %al, 31(%rsi) +; SCALAR-NEXT: movb %al, 32(%rsi) +; SCALAR-NEXT: movb %al, 33(%rsi) +; SCALAR-NEXT: movb %al, 34(%rsi) +; SCALAR-NEXT: movb %al, 35(%rsi) +; SCALAR-NEXT: movb %al, 36(%rsi) +; SCALAR-NEXT: movb %al, 37(%rsi) +; SCALAR-NEXT: movb %al, 38(%rsi) +; SCALAR-NEXT: movb %al, 39(%rsi) +; SCALAR-NEXT: movb %al, 40(%rsi) +; SCALAR-NEXT: movb %al, 41(%rsi) +; SCALAR-NEXT: movb %al, 42(%rsi) +; SCALAR-NEXT: movb %al, 43(%rsi) +; SCALAR-NEXT: movb %al, 44(%rsi) +; SCALAR-NEXT: movb %al, 45(%rsi) +; SCALAR-NEXT: movb %al, 46(%rsi) +; SCALAR-NEXT: movb %al, 47(%rsi) +; SCALAR-NEXT: movb %al, 48(%rsi) +; SCALAR-NEXT: movb %al, 49(%rsi) +; SCALAR-NEXT: movb %al, 50(%rsi) +; SCALAR-NEXT: movb %al, 51(%rsi) +; SCALAR-NEXT: movb %al, 52(%rsi) +; SCALAR-NEXT: movb %al, 53(%rsi) +; SCALAR-NEXT: movb %al, 54(%rsi) +; SCALAR-NEXT: movb %al, 55(%rsi) +; SCALAR-NEXT: movb %al, 56(%rsi) +; SCALAR-NEXT: movb %al, 57(%rsi) +; SCALAR-NEXT: movb %al, 58(%rsi) +; SCALAR-NEXT: movb %al, 59(%rsi) +; SCALAR-NEXT: movb %al, 60(%rsi) +; SCALAR-NEXT: movb %al, 
61(%rsi) +; SCALAR-NEXT: movb %al, 62(%rsi) +; SCALAR-NEXT: movb %al, 63(%rsi) +; SCALAR-NEXT: retq +; +; SSE2-LABEL: vec512_i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: notb %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm0, 32(%rsi) +; SSE2-NEXT: movdqa %xmm0, 48(%rsi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vec512_i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movzbl (%rdi), %eax +; SSSE3-NEXT: notb %al +; SSSE3-NEXT: movzbl %al, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, (%rsi) +; SSSE3-NEXT: movdqa %xmm0, 16(%rsi) +; SSSE3-NEXT: movdqa %xmm0, 32(%rsi) +; SSSE3-NEXT: movdqa %xmm0, 48(%rsi) +; SSSE3-NEXT: retq +; +; AVX1-LABEL: vec512_i8: +; AVX1: # %bb.0: +; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: notb %al +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_i8: +; AVX2: # %bb.0: +; AVX2-NEXT: movzbl (%rdi), %eax +; AVX2-NEXT: notb %al +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: notb %al +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: notb %al +; AVX512BW-NEXT: vpbroadcastb %eax, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.elt.not = load i8, ptr %in.elt.ptr, align 64 %in.elt = xor i8 %in.elt.not, -1 %out.elt0.ptr = getelementptr i8, ptr %out.vec.ptr, i64 0 @@ -1262,43 +2157,100 @@ } define void @vec512_i16(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec512_i16: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movw %ax, (%rsi) -; ALL-NEXT: movw %ax, 2(%rsi) -; ALL-NEXT: movw %ax, 4(%rsi) -; ALL-NEXT: movw %ax, 6(%rsi) -; ALL-NEXT: movw %ax, 8(%rsi) -; ALL-NEXT: movw %ax, 10(%rsi) -; ALL-NEXT: movw %ax, 12(%rsi) -; ALL-NEXT: movw %ax, 14(%rsi) -; ALL-NEXT: movw %ax, 16(%rsi) -; ALL-NEXT: movw %ax, 18(%rsi) -; ALL-NEXT: movw %ax, 20(%rsi) -; ALL-NEXT: movw %ax, 22(%rsi) -; ALL-NEXT: movw %ax, 24(%rsi) -; ALL-NEXT: movw %ax, 26(%rsi) -; ALL-NEXT: movw %ax, 28(%rsi) -; ALL-NEXT: movw %ax, 30(%rsi) -; ALL-NEXT: movw %ax, 32(%rsi) -; ALL-NEXT: movw %ax, 34(%rsi) -; ALL-NEXT: movw %ax, 36(%rsi) -; ALL-NEXT: movw %ax, 38(%rsi) -; ALL-NEXT: movw %ax, 40(%rsi) -; ALL-NEXT: movw %ax, 42(%rsi) -; ALL-NEXT: movw %ax, 44(%rsi) -; ALL-NEXT: movw %ax, 46(%rsi) -; ALL-NEXT: movw %ax, 48(%rsi) -; ALL-NEXT: movw %ax, 50(%rsi) -; ALL-NEXT: movw %ax, 52(%rsi) -; ALL-NEXT: movw %ax, 54(%rsi) -; ALL-NEXT: movw %ax, 
56(%rsi) -; ALL-NEXT: movw %ax, 58(%rsi) -; ALL-NEXT: movw %ax, 60(%rsi) -; ALL-NEXT: movw %ax, 62(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec512_i16: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movw %ax, (%rsi) +; SCALAR-NEXT: movw %ax, 2(%rsi) +; SCALAR-NEXT: movw %ax, 4(%rsi) +; SCALAR-NEXT: movw %ax, 6(%rsi) +; SCALAR-NEXT: movw %ax, 8(%rsi) +; SCALAR-NEXT: movw %ax, 10(%rsi) +; SCALAR-NEXT: movw %ax, 12(%rsi) +; SCALAR-NEXT: movw %ax, 14(%rsi) +; SCALAR-NEXT: movw %ax, 16(%rsi) +; SCALAR-NEXT: movw %ax, 18(%rsi) +; SCALAR-NEXT: movw %ax, 20(%rsi) +; SCALAR-NEXT: movw %ax, 22(%rsi) +; SCALAR-NEXT: movw %ax, 24(%rsi) +; SCALAR-NEXT: movw %ax, 26(%rsi) +; SCALAR-NEXT: movw %ax, 28(%rsi) +; SCALAR-NEXT: movw %ax, 30(%rsi) +; SCALAR-NEXT: movw %ax, 32(%rsi) +; SCALAR-NEXT: movw %ax, 34(%rsi) +; SCALAR-NEXT: movw %ax, 36(%rsi) +; SCALAR-NEXT: movw %ax, 38(%rsi) +; SCALAR-NEXT: movw %ax, 40(%rsi) +; SCALAR-NEXT: movw %ax, 42(%rsi) +; SCALAR-NEXT: movw %ax, 44(%rsi) +; SCALAR-NEXT: movw %ax, 46(%rsi) +; SCALAR-NEXT: movw %ax, 48(%rsi) +; SCALAR-NEXT: movw %ax, 50(%rsi) +; SCALAR-NEXT: movw %ax, 52(%rsi) +; SCALAR-NEXT: movw %ax, 54(%rsi) +; SCALAR-NEXT: movw %ax, 56(%rsi) +; SCALAR-NEXT: movw %ax, 58(%rsi) +; SCALAR-NEXT: movw %ax, 60(%rsi) +; SCALAR-NEXT: movw %ax, 62(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec512_i16: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) +; SSE-NEXT: movdqa %xmm0, 48(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec512_i16: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_i16: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec512_i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: notl %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: notl %eax +; AVX512BW-NEXT: vpbroadcastw %eax, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.elt.not = load i16, ptr %in.elt.ptr, align 64 %in.elt = xor i16 %in.elt.not, -1 %out.elt0.ptr = getelementptr i16, ptr %out.vec.ptr, i64 0 @@ -1369,27 +2321,71 @@ } define void @vec512_i32(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec512_i32: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movl %eax, (%rsi) -; ALL-NEXT: movl %eax, 4(%rsi) -; ALL-NEXT: movl %eax, 8(%rsi) -; ALL-NEXT: movl %eax, 12(%rsi) -; ALL-NEXT: movl 
%eax, 16(%rsi) -; ALL-NEXT: movl %eax, 20(%rsi) -; ALL-NEXT: movl %eax, 24(%rsi) -; ALL-NEXT: movl %eax, 28(%rsi) -; ALL-NEXT: movl %eax, 32(%rsi) -; ALL-NEXT: movl %eax, 36(%rsi) -; ALL-NEXT: movl %eax, 40(%rsi) -; ALL-NEXT: movl %eax, 44(%rsi) -; ALL-NEXT: movl %eax, 48(%rsi) -; ALL-NEXT: movl %eax, 52(%rsi) -; ALL-NEXT: movl %eax, 56(%rsi) -; ALL-NEXT: movl %eax, 60(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec512_i32: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %eax, 8(%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %eax, 16(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl %eax, 24(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %eax, 32(%rsi) +; SCALAR-NEXT: movl %eax, 36(%rsi) +; SCALAR-NEXT: movl %eax, 40(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: movl %eax, 48(%rsi) +; SCALAR-NEXT: movl %eax, 52(%rsi) +; SCALAR-NEXT: movl %eax, 56(%rsi) +; SCALAR-NEXT: movl %eax, 60(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec512_i32: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) +; SSE-NEXT: movdqa %xmm0, 48(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec512_i32: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec512_i32: +; AVX512: # %bb.0: +; AVX512-NEXT: movl (%rdi), %eax +; AVX512-NEXT: notl %eax +; AVX512-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i32, ptr %in.elt.ptr, align 64 %in.elt = xor i32 %in.elt.not, -1 %out.elt0.ptr = getelementptr i32, ptr %out.vec.ptr, i64 0 @@ -1428,27 +2424,71 @@ } define void @vec512_float(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec512_float: -; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: notl %eax -; ALL-NEXT: movl %eax, (%rsi) -; ALL-NEXT: movl %eax, 4(%rsi) -; ALL-NEXT: movl %eax, 8(%rsi) -; ALL-NEXT: movl %eax, 12(%rsi) -; ALL-NEXT: movl %eax, 16(%rsi) -; ALL-NEXT: movl %eax, 20(%rsi) -; ALL-NEXT: movl %eax, 24(%rsi) -; ALL-NEXT: movl %eax, 28(%rsi) -; ALL-NEXT: movl %eax, 32(%rsi) -; ALL-NEXT: movl %eax, 36(%rsi) -; ALL-NEXT: movl %eax, 40(%rsi) -; ALL-NEXT: movl %eax, 44(%rsi) -; ALL-NEXT: movl %eax, 48(%rsi) -; ALL-NEXT: movl %eax, 52(%rsi) -; ALL-NEXT: movl %eax, 56(%rsi) -; ALL-NEXT: movl %eax, 60(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec512_float: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movl (%rdi), %eax +; SCALAR-NEXT: notl %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movl %eax, 4(%rsi) +; SCALAR-NEXT: movl %eax, 8(%rsi) +; SCALAR-NEXT: movl %eax, 12(%rsi) +; SCALAR-NEXT: movl %eax, 16(%rsi) +; SCALAR-NEXT: movl %eax, 20(%rsi) +; SCALAR-NEXT: movl 
%eax, 24(%rsi) +; SCALAR-NEXT: movl %eax, 28(%rsi) +; SCALAR-NEXT: movl %eax, 32(%rsi) +; SCALAR-NEXT: movl %eax, 36(%rsi) +; SCALAR-NEXT: movl %eax, 40(%rsi) +; SCALAR-NEXT: movl %eax, 44(%rsi) +; SCALAR-NEXT: movl %eax, 48(%rsi) +; SCALAR-NEXT: movl %eax, 52(%rsi) +; SCALAR-NEXT: movl %eax, 56(%rsi) +; SCALAR-NEXT: movl %eax, 60(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec512_float: +; SSE: # %bb.0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: notl %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) +; SSE-NEXT: movdqa %xmm0, 48(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec512_float: +; AVX1: # %bb.0: +; AVX1-NEXT: movl (%rdi), %eax +; AVX1-NEXT: notl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_float: +; AVX2: # %bb.0: +; AVX2-NEXT: movl (%rdi), %eax +; AVX2-NEXT: notl %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec512_float: +; AVX512: # %bb.0: +; AVX512-NEXT: movl (%rdi), %eax +; AVX512-NEXT: notl %eax +; AVX512-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i32, ptr %in.elt.ptr, align 64 %in.elt.int = xor i32 %in.elt.not, -1 %in.elt = bitcast i32 %in.elt.int to float @@ -1488,19 +2528,63 @@ } define void @vec512_i64(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec512_i64: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) -; ALL-NEXT: movq %rax, 8(%rsi) -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rax, 24(%rsi) -; ALL-NEXT: movq %rax, 32(%rsi) -; ALL-NEXT: movq %rax, 40(%rsi) -; ALL-NEXT: movq %rax, 48(%rsi) -; ALL-NEXT: movq %rax, 56(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec512_i64: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: notq %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: movq %rax, 40(%rsi) +; SCALAR-NEXT: movq %rax, 48(%rsi) +; SCALAR-NEXT: movq %rax, 56(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec512_i64: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) +; SSE-NEXT: movdqa %xmm0, 48(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec512_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: notq %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: notq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 
32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec512_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: notq %rax +; AVX512-NEXT: vpbroadcastq %rax, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i64, ptr %in.elt.ptr, align 64 %in.elt = xor i64 %in.elt.not, -1 %out.elt0.ptr = getelementptr i64, ptr %out.vec.ptr, i64 0 @@ -1523,19 +2607,63 @@ } define void @vec512_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { -; ALL-LABEL: vec512_double: -; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) -; ALL-NEXT: movq %rax, 8(%rsi) -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rax, 24(%rsi) -; ALL-NEXT: movq %rax, 32(%rsi) -; ALL-NEXT: movq %rax, 40(%rsi) -; ALL-NEXT: movq %rax, 48(%rsi) -; ALL-NEXT: movq %rax, 56(%rsi) -; ALL-NEXT: retq +; SCALAR-LABEL: vec512_double: +; SCALAR: # %bb.0: +; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: notq %rax +; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rax, 8(%rsi) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rax, 24(%rsi) +; SCALAR-NEXT: movq %rax, 32(%rsi) +; SCALAR-NEXT: movq %rax, 40(%rsi) +; SCALAR-NEXT: movq %rax, 48(%rsi) +; SCALAR-NEXT: movq %rax, 56(%rsi) +; SCALAR-NEXT: retq +; +; SSE-LABEL: vec512_double: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) +; SSE-NEXT: movdqa %xmm0, 48(%rsi) +; SSE-NEXT: retq +; +; AVX1-LABEL: vec512_double: +; AVX1: # %bb.0: +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: notq %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec512_double: +; AVX2: # %bb.0: +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: notq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: vec512_double: +; AVX512: # %bb.0: +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: notq %rax +; AVX512-NEXT: vpbroadcastq %rax, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.elt.not = load i64, ptr %in.elt.ptr, align 64 %in.elt.int = xor i64 %in.elt.not, -1 %in.elt = bitcast i64 %in.elt.int to double @@ -1617,17 +2745,8 @@ } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; AVX: {{.*}} -; AVX1: {{.*}} -; AVX2: {{.*}} -; AVX512: {{.*}} -; AVX512BW: {{.*}} -; AVX512F: {{.*}} -; SCALAR: {{.*}} -; SSE: {{.*}} -; SSE2: {{.*}} ; SSE2-ONLY: {{.*}} ; SSE3: {{.*}} ; SSE41: {{.*}} ; SSE42: {{.*}} -; SSSE3: {{.*}} ; SSSE3-ONLY: {{.*}} diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll --- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll +++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll @@ -241,16 +241,17 @@ ; X64-NEXT: shrdq $6, %rsi, %r9 ; X64-NEXT: shrdq $6, %rdx, %rsi ; X64-NEXT: shrdq $6, %rcx, %rdx -; X64-NEXT: sarq $63, %r8 ; X64-NEXT: sarq $6, %rcx ; X64-NEXT: movq %rcx, 56(%rdi) ; X64-NEXT: movq %rdx, 48(%rdi) ; X64-NEXT: movq %rsi, 40(%rdi) ; X64-NEXT: movq %r9, 32(%rdi) -; X64-NEXT: movq %r8, 24(%rdi) -; X64-NEXT: movq %r8, 16(%rdi) -; X64-NEXT: movq %r8, 8(%rdi) -; X64-NEXT: movq %r8, (%rdi) +; X64-NEXT: movq %r8, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-NEXT: psrad $31, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: movdqa %xmm0, 16(%rdi) +; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %Amt = insertelement <2 x i256> , i256 255, i32 0 %Out = ashr <2 x i256> %In, %Amt diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -1129,8 +1129,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i64> %in.subvec.not, @@ -1171,8 +1171,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <2 x i64> %in.subvec.not, @@ -1399,8 +1399,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i32> %in.subvec.not, @@ -1436,8 +1436,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <4 x i32> %in.subvec.not, @@ -1631,8 +1631,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i16> %in.subvec.not, @@ -1798,8 +1798,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; 
AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <16 x i8> %in.subvec.not, @@ -2316,8 +2316,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX-NEXT: retq %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 @@ -2364,8 +2364,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX-NEXT: retq %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 @@ -3738,8 +3738,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX-NEXT: retq %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 @@ -3780,8 +3780,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX-NEXT: retq %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 @@ -4606,8 +4606,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX-NEXT: retq %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64 @@ -5109,8 +5109,8 @@ ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX-NEXT: retq %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64 @@ -5937,16 +5937,38 @@ ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) ; SSE2-NEXT: retq ; -; AVX-LABEL: vec512_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) -; AVX-NEXT: retq +; AVX1-LABEL: vec512_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v2i64: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor (%rdi), 
%xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i64> %in.subvec.not, store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -5991,16 +6013,38 @@ ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) ; SSE2-NEXT: retq ; -; AVX-LABEL: vec512_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) -; AVX-NEXT: retq +; AVX1-LABEL: vec512_v2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v2f64: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <2 x i64> %in.subvec.not, %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double> @@ -6354,16 +6398,38 @@ ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) ; SSE2-NEXT: retq ; -; AVX-LABEL: vec512_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) -; AVX-NEXT: retq +; AVX1-LABEL: vec512_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v4i32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: 
vzeroupper +; AVX512-NEXT: retq %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i32> %in.subvec.not, store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -6401,16 +6467,38 @@ ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) ; SSE2-NEXT: retq ; -; AVX-LABEL: vec512_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) -; AVX-NEXT: retq +; AVX1-LABEL: vec512_v4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v4f32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <4 x i32> %in.subvec.not, %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float> @@ -6476,15 +6564,25 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: vec512_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-ONLY-LABEL: vec512_v4i64: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i64> %in.subvec.not, store <4 x i64> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -6545,15 +6643,25 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: vec512_v4f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-ONLY-LABEL: vec512_v4f64: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; 
AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <4 x i64> %in.subvec.not, %in.subvec = bitcast <4 x i64> %in.subvec.int to <4 x double> @@ -6803,16 +6911,49 @@ ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) ; SSE2-NEXT: retq ; -; AVX-LABEL: vec512_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) -; AVX-NEXT: retq +; AVX1-LABEL: vec512_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v8i16: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512F-LABEL: vec512_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i16> %in.subvec.not, store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -6899,15 +7040,25 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: vec512_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-ONLY-LABEL: vec512_v8i32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; 
AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i32> %in.subvec.not, store <8 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -6990,15 +7141,25 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: vec512_v8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-ONLY-LABEL: vec512_v8f32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: vec512_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <8 x i32> %in.subvec.not, %in.subvec = bitcast <8 x i32> %in.subvec.int to <8 x float> @@ -7209,16 +7370,49 @@ ; SSE2-NEXT: movdqa %xmm0, 48(%rdx) ; SSE2-NEXT: retq ; -; AVX-LABEL: vec512_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) -; AVX-NEXT: retq +; AVX1-LABEL: vec512_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-ONLY-LABEL: vec512_v16i8: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512F-LABEL: vec512_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <16 x i8> %in.subvec.not, store <16 x i8> %in.subvec, ptr 
%out.subvec.ptr, align 64 @@ -7389,15 +7583,35 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: vec512_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-ONLY-LABEL: vec512_v16i16: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512F-LABEL: vec512_v16i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.subvec.not = load <16 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <16 x i16> %in.subvec.not, store <16 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -7704,15 +7918,35 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: vec512_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-ONLY-LABEL: vec512_v32i8: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512F-LABEL: vec512_v32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: vec512_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vpxor (%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.subvec.not = load <32 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <32 x i8> %in.subvec.not, store <32 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -1016,11 +1016,10 @@ ; X32-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X32-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X32-SSE42-NEXT: movl %esi, (%esp) -; X32-SSE42-NEXT: sarl $31, %edx -; 
X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-SSE42-NEXT: movd %edx, %xmm0 +; X32-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-SSE42-NEXT: psrad $31, %xmm0 +; X32-SSE42-NEXT: movdqu %xmm0, {{[0-9]+}}(%esp) ; X32-SSE42-NEXT: andl $15, %ecx ; X32-SSE42-NEXT: movups (%esp,%ecx), %xmm0 ; X32-SSE42-NEXT: movups %xmm0, (%eax) @@ -1030,37 +1029,66 @@ ; X32-SSE42-NEXT: popl %ebx ; X32-SSE42-NEXT: retl ; -; X32-AVX-LABEL: ashr_16bytes: -; X32-AVX: # %bb.0: -; X32-AVX-NEXT: pushl %ebx -; X32-AVX-NEXT: pushl %edi -; X32-AVX-NEXT: pushl %esi -; X32-AVX-NEXT: subl $32, %esp -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-AVX-NEXT: movl (%edx), %esi -; X32-AVX-NEXT: movl 4(%edx), %edi -; X32-AVX-NEXT: movl 8(%edx), %ebx -; X32-AVX-NEXT: movl 12(%edx), %edx -; X32-AVX-NEXT: movzbl (%ecx), %ecx -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %esi, (%esp) -; X32-AVX-NEXT: sarl $31, %edx -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: andl $15, %ecx -; X32-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX-NEXT: vmovups %xmm0, (%eax) -; X32-AVX-NEXT: addl $32, %esp -; X32-AVX-NEXT: popl %esi -; X32-AVX-NEXT: popl %edi -; X32-AVX-NEXT: popl %ebx -; X32-AVX-NEXT: retl +; X32-AVX1-LABEL: ashr_16bytes: +; X32-AVX1: # %bb.0: +; X32-AVX1-NEXT: pushl %ebx +; X32-AVX1-NEXT: pushl %edi +; X32-AVX1-NEXT: pushl %esi +; X32-AVX1-NEXT: subl $32, %esp +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-AVX1-NEXT: movl (%edx), %esi +; X32-AVX1-NEXT: movl 4(%edx), %edi +; X32-AVX1-NEXT: movl 8(%edx), %ebx +; X32-AVX1-NEXT: movl 12(%edx), %edx +; X32-AVX1-NEXT: movzbl (%ecx), %ecx +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %esi, (%esp) +; X32-AVX1-NEXT: vmovd %edx, %xmm0 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqu %xmm0, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: andl $15, %ecx +; X32-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0 +; X32-AVX1-NEXT: vmovups %xmm0, (%eax) +; X32-AVX1-NEXT: addl $32, %esp +; X32-AVX1-NEXT: popl %esi +; X32-AVX1-NEXT: popl %edi +; X32-AVX1-NEXT: popl %ebx +; X32-AVX1-NEXT: retl +; +; X32-AVX512-LABEL: ashr_16bytes: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: pushl %ebx +; X32-AVX512-NEXT: pushl %edi +; X32-AVX512-NEXT: pushl %esi +; X32-AVX512-NEXT: subl $32, %esp +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-AVX512-NEXT: movl (%edx), %esi +; X32-AVX512-NEXT: movl 4(%edx), %edi +; X32-AVX512-NEXT: movl 8(%edx), %ebx +; X32-AVX512-NEXT: movl 12(%edx), %edx +; X32-AVX512-NEXT: movzbl (%ecx), %ecx +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %esi, (%esp) +; X32-AVX512-NEXT: sarl $31, %edx +; 
X32-AVX512-NEXT: vpbroadcastd %edx, %xmm0 +; X32-AVX512-NEXT: vmovdqu %xmm0, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: andl $15, %ecx +; X32-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0 +; X32-AVX512-NEXT: vmovups %xmm0, (%eax) +; X32-AVX512-NEXT: addl $32, %esp +; X32-AVX512-NEXT: popl %esi +; X32-AVX512-NEXT: popl %edi +; X32-AVX512-NEXT: popl %ebx +; X32-AVX512-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %byteOff = load i128, ptr %byteOff.ptr, align 1 %bitOff = shl i128 %byteOff, 3 @@ -1466,38 +1494,60 @@ ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: sarq $63, %rcx -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movq %rcx, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-SSE42-NEXT: pxor %xmm1, %xmm1 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqu %xmm1, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movdqu %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: andl $31, %esi ; X64-SSE42-NEXT: movups -64(%rsp,%rsi), %xmm0 ; X64-SSE42-NEXT: movups -48(%rsp,%rsi), %xmm1 -; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) +; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: ashr_32bytes: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %xmm0 -; X64-AVX-NEXT: movq 16(%rdi), %rax -; X64-AVX-NEXT: movq 24(%rdi), %rcx -; X64-AVX-NEXT: movzbl (%rsi), %esi -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: sarq $63, %rcx -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: andl $31, %esi -; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rsi), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: ashr_32bytes: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %xmm0 +; X64-AVX1-NEXT: movq 16(%rdi), %rax +; X64-AVX1-NEXT: movq 24(%rdi), %rcx +; X64-AVX1-NEXT: movzbl (%rsi), %esi +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovq %rcx, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: andl $31, %esi +; X64-AVX1-NEXT: vmovups -64(%rsp,%rsi), %xmm0 +; X64-AVX1-NEXT: vmovups -48(%rsp,%rsi), %xmm1 +; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: ashr_32bytes: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovups (%rdi), %xmm0 +; X64-AVX512-NEXT: movq 16(%rdi), %rax +; X64-AVX512-NEXT: movq 24(%rdi), %rcx +; X64-AVX512-NEXT: movzbl (%rsi), %esi +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: sarq $63, %rcx +; X64-AVX512-NEXT: vpbroadcastq %rcx, %ymm0 +; 
X64-AVX512-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: andl $31, %esi +; X64-AVX512-NEXT: vmovups -64(%rsp,%rsi), %xmm0 +; X64-AVX512-NEXT: vmovups -48(%rsp,%rsi), %xmm1 +; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq ; ; X32-SSE2-LABEL: ashr_32bytes: ; X32-SSE2: # %bb.0: @@ -1587,65 +1637,92 @@ ; X32-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X32-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X32-SSE42-NEXT: movups %xmm0, (%esp) -; X32-SSE42-NEXT: sarl $31, %edx -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-SSE42-NEXT: movd %edx, %xmm0 +; X32-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-SSE42-NEXT: psrad $31, %xmm0 +; X32-SSE42-NEXT: movdqu %xmm0, {{[0-9]+}}(%esp) +; X32-SSE42-NEXT: movdqu %xmm0, {{[0-9]+}}(%esp) ; X32-SSE42-NEXT: andl $31, %ecx ; X32-SSE42-NEXT: movups (%esp,%ecx), %xmm0 ; X32-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1 -; X32-SSE42-NEXT: movups %xmm1, 16(%eax) ; X32-SSE42-NEXT: movups %xmm0, (%eax) +; X32-SSE42-NEXT: movups %xmm1, 16(%eax) ; X32-SSE42-NEXT: addl $64, %esp ; X32-SSE42-NEXT: popl %esi ; X32-SSE42-NEXT: popl %edi ; X32-SSE42-NEXT: popl %ebx ; X32-SSE42-NEXT: retl ; -; X32-AVX-LABEL: ashr_32bytes: -; X32-AVX: # %bb.0: -; X32-AVX-NEXT: pushl %ebx -; X32-AVX-NEXT: pushl %edi -; X32-AVX-NEXT: pushl %esi -; X32-AVX-NEXT: subl $64, %esp -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-AVX-NEXT: vmovups (%edx), %xmm0 -; X32-AVX-NEXT: movl 16(%edx), %esi -; X32-AVX-NEXT: movl 20(%edx), %edi -; X32-AVX-NEXT: movl 24(%edx), %ebx -; X32-AVX-NEXT: movl 28(%edx), %edx -; X32-AVX-NEXT: movzbl (%ecx), %ecx -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: vmovups %xmm0, (%esp) -; X32-AVX-NEXT: sarl $31, %edx -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-AVX-NEXT: andl $31, %ecx -; X32-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X32-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X32-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X32-AVX-NEXT: vmovups %xmm0, (%eax) -; X32-AVX-NEXT: addl $64, %esp -; X32-AVX-NEXT: popl %esi -; X32-AVX-NEXT: popl %edi -; X32-AVX-NEXT: popl %ebx -; X32-AVX-NEXT: retl +; X32-AVX1-LABEL: ashr_32bytes: +; X32-AVX1: # %bb.0: +; X32-AVX1-NEXT: pushl %ebx +; X32-AVX1-NEXT: pushl %edi +; X32-AVX1-NEXT: pushl %esi +; X32-AVX1-NEXT: subl $64, %esp +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-AVX1-NEXT: vmovups (%edx), %xmm0 +; X32-AVX1-NEXT: movl 16(%edx), %esi +; X32-AVX1-NEXT: movl 20(%edx), %edi +; X32-AVX1-NEXT: movl 24(%edx), %ebx +; X32-AVX1-NEXT: movl 28(%edx), %edx +; 
X32-AVX1-NEXT: movzbl (%ecx), %ecx +; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: vmovups %xmm0, (%esp) +; X32-AVX1-NEXT: vmovd %edx, %xmm0 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqu %xmm0, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: vmovdqu %xmm0, {{[0-9]+}}(%esp) +; X32-AVX1-NEXT: andl $31, %ecx +; X32-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0 +; X32-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1 +; X32-AVX1-NEXT: vmovups %xmm0, (%eax) +; X32-AVX1-NEXT: vmovups %xmm1, 16(%eax) +; X32-AVX1-NEXT: addl $64, %esp +; X32-AVX1-NEXT: popl %esi +; X32-AVX1-NEXT: popl %edi +; X32-AVX1-NEXT: popl %ebx +; X32-AVX1-NEXT: retl +; +; X32-AVX512-LABEL: ashr_32bytes: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: pushl %ebx +; X32-AVX512-NEXT: pushl %edi +; X32-AVX512-NEXT: pushl %esi +; X32-AVX512-NEXT: subl $64, %esp +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-AVX512-NEXT: vmovups (%edx), %xmm0 +; X32-AVX512-NEXT: movl 16(%edx), %esi +; X32-AVX512-NEXT: movl 20(%edx), %edi +; X32-AVX512-NEXT: movl 24(%edx), %ebx +; X32-AVX512-NEXT: movl 28(%edx), %edx +; X32-AVX512-NEXT: movzbl (%ecx), %ecx +; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: vmovups %xmm0, (%esp) +; X32-AVX512-NEXT: sarl $31, %edx +; X32-AVX512-NEXT: vpbroadcastd %edx, %ymm0 +; X32-AVX512-NEXT: vmovdqu %ymm0, {{[0-9]+}}(%esp) +; X32-AVX512-NEXT: andl $31, %ecx +; X32-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0 +; X32-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1 +; X32-AVX512-NEXT: vmovups %xmm0, (%eax) +; X32-AVX512-NEXT: vmovups %xmm1, 16(%eax) +; X32-AVX512-NEXT: addl $64, %esp +; X32-AVX512-NEXT: popl %esi +; X32-AVX512-NEXT: popl %edi +; X32-AVX512-NEXT: popl %ebx +; X32-AVX512-NEXT: vzeroupper +; X32-AVX512-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 %bitOff = shl i256 %byteOff, 3 @@ -2419,7 +2496,7 @@ ; X64-SSE42-LABEL: ashr_64bytes: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: movups (%rdi), %xmm0 -; X64-SSE42-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE42-NEXT: movdqu 16(%rdi), %xmm1 ; X64-SSE42-NEXT: movups 32(%rdi), %xmm2 ; X64-SSE42-NEXT: movq 48(%rdi), %rax ; X64-SSE42-NEXT: movq 56(%rdi), %rcx @@ -2427,59 +2504,82 @@ ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movdqu %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: sarq $63, %rcx -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movq %rcx, %xmm0 +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-SSE42-NEXT: pxor %xmm1, %xmm1 +; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqu 
%xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movdqu %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movdqu %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movdqu %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-SSE42-NEXT: andl $63, %esi
 ; X64-SSE42-NEXT: movups -128(%rsp,%rsi), %xmm0
 ; X64-SSE42-NEXT: movups -112(%rsp,%rsi), %xmm1
 ; X64-SSE42-NEXT: movups -96(%rsp,%rsi), %xmm2
 ; X64-SSE42-NEXT: movups -80(%rsp,%rsi), %xmm3
-; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
-; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
+; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
 ; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
 ; X64-SSE42-NEXT: retq
 ;
-; X64-AVX-LABEL: ashr_64bytes:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-AVX-NEXT: movq 48(%rdi), %rax
-; X64-AVX-NEXT: movq 56(%rdi), %rcx
-; X64-AVX-NEXT: movl (%rsi), %esi
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: sarq $63, %rcx
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $63, %esi
-; X64-AVX-NEXT: vmovups -128(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT: vmovups -112(%rsp,%rsi), %xmm1
-; X64-AVX-NEXT: vmovups -96(%rsp,%rsi), %xmm2
-; X64-AVX-NEXT: vmovups -80(%rsp,%rsi), %xmm3
-; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT: vzeroupper
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: ashr_64bytes:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX1-NEXT: vmovdqu 32(%rdi), %xmm1
+; X64-AVX1-NEXT: movq 48(%rdi), %rax
+; X64-AVX1-NEXT: movq 56(%rdi), %rcx
+; X64-AVX1-NEXT: movl (%rsi), %esi
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovdqu %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovq %rcx, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: andl $63, %esi
+; X64-AVX1-NEXT: vmovups -128(%rsp,%rsi), %xmm0
+; X64-AVX1-NEXT: vmovups -112(%rsp,%rsi), %xmm1
+; X64-AVX1-NEXT: vmovups -96(%rsp,%rsi), %xmm2
+; X64-AVX1-NEXT: vmovups -80(%rsp,%rsi), %xmm3
+; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
+; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
+; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX512-LABEL: ashr_64bytes:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX512-NEXT: vmovups 32(%rdi), %xmm1
+; X64-AVX512-NEXT: movq 48(%rdi), %rax
+; X64-AVX512-NEXT: movq 56(%rdi), %rcx
+; X64-AVX512-NEXT: movl (%rsi), %esi
+; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT: sarq $63, %rcx
+; X64-AVX512-NEXT: vpbroadcastq %rcx, %zmm0
+; X64-AVX512-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT: andl $63, %esi
+; X64-AVX512-NEXT: vmovups -128(%rsp,%rsi), %xmm0
+; X64-AVX512-NEXT: vmovups -112(%rsp,%rsi), %xmm1
+; X64-AVX512-NEXT: vmovups -96(%rsp,%rsi), %xmm2
+; X64-AVX512-NEXT: vmovups -80(%rsp,%rsi), %xmm3
+; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
+; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
+; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
 ;
 ; X32-SSE2-LABEL: ashr_64bytes:
 ; X32-SSE2: # %bb.0:
@@ -2645,23 +2745,13 @@
 ; X32-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
 ; X32-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE42-NEXT: movups %xmm0, (%esp)
-; X32-SSE42-NEXT: sarl $31, %edx
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT: movd %edx, %xmm0
+; X32-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-SSE42-NEXT: psrad $31, %xmm0
+; X32-SSE42-NEXT: movdqu %xmm0, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT: movdqu %xmm0, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT: movdqu %xmm0, {{[0-9]+}}(%esp)
+; X32-SSE42-NEXT: movdqu %xmm0, {{[0-9]+}}(%esp)
 ; X32-SSE42-NEXT: andl $63, %ecx
 ; X32-SSE42-NEXT: movups (%esp,%ecx), %xmm0
 ; X32-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
@@ -2677,60 +2767,90 @@
 ; X32-SSE42-NEXT: popl %ebx
 ; X32-SSE42-NEXT: retl
 ;
-; X32-AVX-LABEL: ashr_64bytes:
-; X32-AVX: # %bb.0:
-; X32-AVX-NEXT: pushl %ebx
-; X32-AVX-NEXT: pushl %edi
-; X32-AVX-NEXT: pushl %esi
-; X32-AVX-NEXT: subl $128, %esp
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-AVX-NEXT: vmovups (%edx), %ymm0
-; X32-AVX-NEXT: vmovups 32(%edx), %xmm1
-; X32-AVX-NEXT: movl 48(%edx), %esi
-; X32-AVX-NEXT: movl 52(%edx), %edi
-; X32-AVX-NEXT: movl 56(%edx), %ebx
-; X32-AVX-NEXT: movl 60(%edx), %edx
-; X32-AVX-NEXT: movl (%ecx), %ecx
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: vmovups %ymm0, (%esp)
-; X32-AVX-NEXT: sarl $31, %edx
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: andl $63, %ecx
-; X32-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X32-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X32-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm2
-; X32-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm3
-; X32-AVX-NEXT: vmovups %xmm3, 48(%eax)
-; X32-AVX-NEXT: vmovups %xmm2, 32(%eax)
-; X32-AVX-NEXT: vmovups %xmm1, 16(%eax)
-; X32-AVX-NEXT: vmovups %xmm0, (%eax)
-; X32-AVX-NEXT: addl $128, %esp
-; X32-AVX-NEXT: popl %esi
-; X32-AVX-NEXT: popl %edi
-; X32-AVX-NEXT: popl %ebx
-; X32-AVX-NEXT: vzeroupper
-; X32-AVX-NEXT: retl
+; X32-AVX1-LABEL: ashr_64bytes:
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: pushl %ebx
+; X32-AVX1-NEXT: pushl %edi
+; X32-AVX1-NEXT: pushl %esi
+; X32-AVX1-NEXT: subl $128, %esp
+; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-AVX1-NEXT: vmovups (%edx), %ymm0
+; X32-AVX1-NEXT: vmovups 32(%edx), %xmm1
+; X32-AVX1-NEXT: movl 48(%edx), %esi
+; X32-AVX1-NEXT: movl 52(%edx), %edi
+; X32-AVX1-NEXT: movl 56(%edx), %ebx
+; X32-AVX1-NEXT: movl 60(%edx), %edx
+; X32-AVX1-NEXT: movl (%ecx), %ecx
+; X32-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT: vmovups %ymm0, (%esp)
+; X32-AVX1-NEXT: vmovd %edx, %xmm0
+; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX1-NEXT: andl $63, %ecx
+; X32-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0
+; X32-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1
+; X32-AVX1-NEXT: vmovups 32(%esp,%ecx), %xmm2
+; X32-AVX1-NEXT: vmovups 48(%esp,%ecx), %xmm3
+; X32-AVX1-NEXT: vmovups %xmm3, 48(%eax)
+; X32-AVX1-NEXT: vmovups %xmm2, 32(%eax)
+; X32-AVX1-NEXT: vmovups %xmm1, 16(%eax)
+; X32-AVX1-NEXT: vmovups %xmm0, (%eax)
+; X32-AVX1-NEXT: addl $128, %esp
+; X32-AVX1-NEXT: popl %esi
+; X32-AVX1-NEXT: popl %edi
+; X32-AVX1-NEXT: popl %ebx
+; X32-AVX1-NEXT: vzeroupper
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX512-LABEL: ashr_64bytes:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: pushl %ebx
+; X32-AVX512-NEXT: pushl %edi
+; X32-AVX512-NEXT: pushl %esi
+; X32-AVX512-NEXT: subl $128, %esp
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-AVX512-NEXT: vmovups (%edx), %ymm0
+; X32-AVX512-NEXT: vmovups 32(%edx), %xmm1
+; X32-AVX512-NEXT: movl 48(%edx), %esi
+; X32-AVX512-NEXT: movl 52(%edx), %edi
+; X32-AVX512-NEXT: movl 56(%edx), %ebx
+; X32-AVX512-NEXT: movl 60(%edx), %edx
+; X32-AVX512-NEXT: movl (%ecx), %ecx
+; X32-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-AVX512-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X32-AVX512-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X32-AVX512-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
+; X32-AVX512-NEXT: vmovups %ymm0, (%esp)
+; X32-AVX512-NEXT: sarl $31, %edx
+; X32-AVX512-NEXT: vpbroadcastd %edx, %zmm0
+; X32-AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%esp)
+; X32-AVX512-NEXT: andl $63, %ecx
+; X32-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0
+; X32-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1
+; X32-AVX512-NEXT: vmovups 32(%esp,%ecx), %xmm2
+; X32-AVX512-NEXT: vmovups 48(%esp,%ecx), %xmm3
+; X32-AVX512-NEXT: vmovups %xmm3, 48(%eax)
+; X32-AVX512-NEXT: vmovups %xmm2, 32(%eax)
+; X32-AVX512-NEXT: vmovups %xmm1, 16(%eax)
+; X32-AVX512-NEXT: vmovups %xmm0, (%eax)
+; X32-AVX512-NEXT: addl $128, %esp
+; X32-AVX512-NEXT: popl %esi
+; X32-AVX512-NEXT: popl %edi
+; X32-AVX512-NEXT: popl %ebx
+; X32-AVX512-NEXT: vzeroupper
+; X32-AVX512-NEXT: retl
 %src = load i512, ptr %src.ptr, align 1
 %byteOff = load i512, ptr %byteOff.ptr, align 1
 %bitOff = shl i512 %byteOff, 3