Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -25289,7 +25289,7 @@
 /// instructions, and replace them with the slightly more expensive SSSE3
 /// PSHUFB instruction if available. We do this as the last combining step
 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
-/// a suitable short sequence of other instructions. The PHUFB will either
+/// a suitable short sequence of other instructions. The PSHUFB will either
 /// use a register or have to read from memory and so is slightly (but only
 /// slightly) more expensive than the other shuffle instructions.
 ///
@@ -25302,7 +25302,8 @@
 /// would simplify under the threshold for PSHUFB formation because of
 /// combine-ordering. To fix this, we should do the redundant instruction
 /// combining in this recursive walk.
-static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
+static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
+                                          int SrcOpIndex, SDValue Root,
                                           ArrayRef<int> RootMask,
                                           int Depth, bool HasVariableMask,
                                           SelectionDAG &DAG,
@@ -25314,6 +25315,7 @@
     return false;
 
   // Directly rip through bitcasts to find the underlying operand.
+  SDValue Op = SrcOps[SrcOpIndex];
   Op = peekThroughOneUseBitcasts(Op);
 
   MVT VT = Op.getSimpleValueType();
@@ -25331,6 +25333,27 @@
   if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
     return false;
 
+  // Add the inputs to the Ops list, avoiding duplicates.
+  SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
+
+  int InputIdx0 = -1, InputIdx1 = -1;
+  for (int i = 0, e = Ops.size(); i < e; ++i) {
+    SDValue BC = peekThroughBitcasts(Ops[i]);
+    if (Input0 && BC == peekThroughBitcasts(Input0))
+      InputIdx0 = i;
+    if (Input1 && BC == peekThroughBitcasts(Input1))
+      InputIdx1 = i;
+  }
+
+  if (Input0 && InputIdx0 < 0) {
+    InputIdx0 = SrcOpIndex;
+    Ops[SrcOpIndex] = Input0;
+  }
+  if (Input1 && InputIdx1 < 0) {
+    InputIdx1 = Ops.size();
+    Ops.push_back(Input1);
+  }
+
   assert(VT.getVectorNumElements() == OpMask.size() &&
          "Different mask size from vector size!");
   assert(((RootMask.size() > OpMask.size() &&
@@ -25362,6 +25385,17 @@
     }
 
     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+
+    // Just insert the scaled root mask value if it references an input other
+    // than the SrcOp we're currently inserting.
+    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
+        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
+      Mask.push_back(RootMaskedIdx);
+      continue;
+    }
+
+    RootMaskedIdx %= MaskWidth;
+
     int OpIdx = RootMaskedIdx / OpRatio;
     if (OpMask[OpIdx] < 0) {
       // The incoming lanes are zero or undef, it doesn't matter which ones we
@@ -25370,9 +25404,19 @@
       continue;
     }
 
-    // Ok, we have non-zero lanes, map them through.
-    Mask.push_back(OpMask[OpIdx] * OpRatio +
-                   RootMaskedIdx % OpRatio);
+    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
+    int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
+    OpMaskedIdx %= MaskWidth;
+
+    if (OpMask[OpIdx] < (int)OpMask.size()) {
+      assert(0 <= InputIdx0 && "Unknown target shuffle input");
+      OpMaskedIdx += InputIdx0 * MaskWidth;
+    } else {
+      assert(0 <= InputIdx1 && "Unknown target shuffle input");
+      OpMaskedIdx += InputIdx1 * MaskWidth;
+    }
+
+    Mask.push_back(OpMaskedIdx);
   }
 
   // Handle the all undef/zero cases early.
@@ -25389,29 +25433,35 @@
     return true;
   }
 
-  int MaskSize = Mask.size();
-  bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
-    [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
-  bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
-    [MaskSize](int Idx) { return MaskSize <= Idx; });
-
-  // At the moment we can only combine unary shuffle mask cases.
-  if (UseInput0 && UseInput1)
-    return false;
-  else if (UseInput1) {
-    std::swap(Input0, Input1);
-    ShuffleVectorSDNode::commuteMask(Mask);
+  // Remove unused shuffle source ops.
+  SmallVector<SDValue, 16> UsedOps;
+  for (int i = 0, e = Ops.size(); i < e; ++i) {
+    int lo = UsedOps.size() * MaskWidth;
+    int hi = lo + MaskWidth;
+    if (std::any_of(Mask.begin(), Mask.end(),
+                    [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+      UsedOps.push_back(Ops[i]);
+      continue;
+    }
+    for (int &M : Mask)
+      if (lo <= M)
+        M -= MaskWidth;
   }
-
-  assert(Input0 && "Shuffle with no inputs detected");
+  assert(!UsedOps.empty() && "Shuffle with no inputs detected");
+  Ops = UsedOps;
 
   HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
 
-  // See if we can recurse into Input0 (if it's a target shuffle).
-  if (Op->isOnlyUserOf(Input0.getNode()) &&
-      combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
-                                    HasVariableMask, DAG, DCI, Subtarget))
-    return true;
+  // See if we can recurse into each shuffle source op (if it's a target shuffle).
+  for (int i = 0, e = Ops.size(); i < e; ++i)
+    if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
+      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
+                                        HasVariableMask, DAG, DCI, Subtarget))
+        return true;
+
+  // At the moment we can only combine unary shuffle mask cases.
+  if (Ops.size() != 1)
+    return false;
 
   // Minor canonicalization of the accumulated shuffle mask to make it easier
   // to match below. All this does is detect masks with sequential pairs of
@@ -25423,7 +25473,7 @@
     Mask = std::move(WidenedMask);
   }
 
-  return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
+  return combineX86ShuffleChain(Ops[0], Root, Mask, Depth, HasVariableMask, DAG,
                                 DCI, Subtarget);
 }
 
@@ -26099,8 +26149,8 @@
     return LD;
 
   if (isTargetShuffle(N->getOpcode())) {
-    if (SDValue Shuffle =
-            combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
+    SDValue Op(N, 0);
+    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
       return Shuffle;
 
     // Try recursively combining arbitrary sequences of x86 shuffle
@@ -26110,7 +26160,7 @@
     // a particular chain.
     SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
-    if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
                                       DCI, Subtarget))
       return SDValue(); // This routine will use CombineTo to replace N.
Index: llvm/trunk/test/CodeGen/X86/avx512-build-vector.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-build-vector.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-build-vector.ll
@@ -14,13 +14,10 @@
 define <16 x float> @test3(<4 x float> %a) {
 ; CHECK-LABEL: test3:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
-; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1]
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; CHECK-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11,0,1,2,3],zero,zero,zero,zero
+; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %b = extractelement <4 x float> %a, i32 2
Index: llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -10,16 +10,12 @@
 define <2 x i64> @_clearupper2xi64a(<2 x i64>) nounwind {
 ; SSE-LABEL: _clearupper2xi64a:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper2xi64a:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x0 = extractelement <2 x i64> %0, i32 0
   %x1 = extractelement <2 x i64> %0, i32 1
@@ -35,13 +31,7 @@
 define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
 ; SSE-LABEL: _clearupper4xi32a:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper4xi32a:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -247,8 +247,7 @@
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -268,8 +267,7 @@
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -289,8 +287,7 @@
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
@@ -310,8 +307,7 @@
 ;
 ; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -480,8 +480,7 @@
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -502,8 +501,7 @@
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -524,8 +522,7 @@
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -546,8 +543,7 @@
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -568,8 +564,7 @@
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -590,8 +585,7 @@
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -612,8 +606,7 @@
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
@@ -636,8 +629,7 @@
 ;
 ; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
 ; AVX2-NEXT:    movl $15, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1812,9 +1812,6 @@
 ;
 ; AVX-LABEL: combine_test23:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX-NEXT:    vmovups %xmm0, (%rdi)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq