Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14226,9 +14226,13 @@
 }
 
 // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
-// operations. If the types of the vectors we're extracting from allow it,
-// turn this into a truncate node.
+// operations which can be matched to a truncate.
 SDValue DAGCombiner::reduceBuildVecToTrunc(SDNode *N) {
+  // TODO: Add support for big-endian.
+  if (DAG.getDataLayout().isBigEndian())
+    return SDValue();
+  if (N->getNumOperands() < 2)
+    return SDValue();
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
   unsigned NumElems = N->getNumOperands();
@@ -14238,11 +14242,12 @@
 
   // If the input is something other than an EXTRACT_VECTOR_ELT with a constant
   // index, bail out.
-  if (std::find_if_not(N->op_begin(), N->op_end(), [VT](SDValue Op) {
-        return Op.isUndef() || (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-                                isa<ConstantSDNode>(Op.getOperand(1)) &&
-                                Op.getValueType() == VT.getVectorElementType());
-      }) != N->op_end())
+  // TODO: Allow undef elements in some cases?
+  if (any_of(make_range(N->op_begin(), N->op_end()), [VT](SDValue Op) {
+        return Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+               !isa<ConstantSDNode>(Op.getOperand(1)) ||
+               Op.getValueType() != VT.getVectorElementType();
+      }))
     return SDValue();
 
   // Helper for obtaining an EXTRACT_VECTOR_ELT's index operand
@@ -14250,42 +14255,28 @@
     return cast<ConstantSDNode>(Extract.getOperand(1))->getSExtValue();
   };
 
-  // The first BUILD_VECTOR operand may be an undef or an extract from index
-  // zero.
-  if (!N->getOperand(0).isUndef() && GetExtractIdx(N->getOperand(0)) != 0)
+  // The first BUILD_VECTOR operand must be an extract from index zero
+  // (assuming no undef and little-endian).
+  if (GetExtractIdx(N->getOperand(0)) != 0)
     return SDValue();
 
-  // Skip over undef operands and bail out if all are undef's.
-  unsigned i = 1;
-  for (; i != NumElems && N->getOperand(i).isUndef(); ++i)
-    ;
-  if (i == NumElems)
-    return SDValue();
-
-  // Compute the stride from the first non-undef element's index.
-  int Stride = GetExtractIdx(N->getOperand(i)) / i;
-  SDValue ExtractedFromVec = N->getOperand(i).getOperand(0);
+  // Compute the stride from the first index.
+  int Stride = GetExtractIdx(N->getOperand(1));
+  SDValue ExtractedFromVec = N->getOperand(1).getOperand(0);
 
   // Proceed only if the stride and the types can be matched to a truncate.
-  if (Stride == 1 || !isPowerOf2_32(Stride))
-    return SDValue();
-  if (ExtractedFromVec.getValueType().getVectorNumElements() !=
-      Stride * NumElems)
-    return SDValue();
-  if (VT.getScalarSizeInBits() * Stride > 64)
+  if ((Stride == 1 || !isPowerOf2_32(Stride)) ||
+      (ExtractedFromVec.getValueType().getVectorNumElements() !=
+       Stride * NumElems) ||
+      (VT.getScalarSizeInBits() * Stride > 64))
     return SDValue();
 
-  // Check remaining operands are consistent with the computed stride
-  for (; i != NumElems; ++i) {
+  // Check remaining operands are consistent with the computed stride.
+  for (unsigned i = 1; i != NumElems; ++i) {
     SDValue Op = N->getOperand(i);
 
-    if (Op.isUndef())
-      continue;
-
-    if (Op.getOperand(0) != ExtractedFromVec)
-      return SDValue();
-
-    if (GetExtractIdx(Op) != Stride * i)
+    if ((Op.getOperand(0) != ExtractedFromVec) ||
+        (GetExtractIdx(Op) != Stride * i))
       return SDValue();
   }
 
@@ -14295,11 +14286,11 @@
                              NumElems);
   EVT TruncVT = VT.isFloatingPoint() ?
       VT.changeVectorElementTypeToInteger() : VT;
-  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT,
-                              DAG.getBitcast(NewVT, ExtractedFromVec));
+  SDValue Res = DAG.getBitcast(NewVT, ExtractedFromVec);
+  Res = DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, Res);
   if (VT.isFloatingPoint())
-    Trunc = DAG.getBitcast(VT, Trunc);
-  return Trunc;
+    Res = DAG.getBitcast(VT, Res);
+  return Res;
 }
 
 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
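Reviewer note, not part of the patch: a minimal sketch of the shape reduceBuildVecToTrunc matches after this change, assuming a little-endian target (illustrative types and values in DAG-dump style, not taken from the patch):

  t5: v4i16 = build_vector (extract_vector_elt t0:v8i16, Constant:i32<0>),
                  (extract_vector_elt t0, Constant:i32<2>),
                  (extract_vector_elt t0, Constant:i32<4>),
                  (extract_vector_elt t0, Constant:i32<6>)

Here Stride = 2 and NumElems = 4, the v8i16 source has exactly Stride * NumElems = 8 elements, and 16 bits * Stride = 32 <= 64, so every check passes and the node is rebuilt as

  t5: v4i16 = truncate (v4i32 bitcast t0)

NewVT groups each pair of source elements into one 32-bit lane, and the truncate keeps the low half of each lane, which on little-endian is exactly elements 0, 2, 4, 6; that is why the combine bails out on big-endian for now.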
Index: test/CodeGen/ARM/vext.ll
===================================================================
--- test/CodeGen/ARM/vext.ll
+++ test/CodeGen/ARM/vext.ll
@@ -199,10 +199,10 @@
 define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ; CHECK-LABEL: test_undef:
 ; CHECK:       @ BB#0:
-; CHECK-NEXT:    vldr d16, [r1]
-; CHECK-NEXT:    vldr d17, [r0, #8]
+; CHECK-NEXT:    vldr d16, [r1]
+; CHECK-NEXT:    vldr d17, [r0, #8]
 ; CHECK-NEXT:    vzip.16 d17, d16
-; CHECK-NEXT:    vmov r0, r1, d17
+; CHECK-NEXT:    vmov r0, r1, d17
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <8 x i16>, <8 x i16>* %A
   %tmp2 = load <8 x i16>, <8 x i16>* %B
Index: test/CodeGen/ARM/vpadd.ll
===================================================================
--- test/CodeGen/ARM/vpadd.ll
+++ test/CodeGen/ARM/vpadd.ll
@@ -548,7 +548,19 @@
 ; And <2 x i8> to <2 x i32>
 define <2 x i8> @fromExtendingExtractVectorElt_2i8(<8 x i8> %in) {
 ; CHECK-LABEL: fromExtendingExtractVectorElt_2i8:
-; CHECK: vadd.i32
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmov.u8 r2, d16[1]
+; CHECK-NEXT:    vmov.u8 r3, d16[0]
+; CHECK-NEXT:    vmov.u8 r0, d16[3]
+; CHECK-NEXT:    vmov.u8 r1, d16[2]
+; CHECK-NEXT:    vmov.32 d16[0], r3
+; CHECK-NEXT:    vmov.32 d17[0], r2
+; CHECK-NEXT:    vmov.32 d16[1], r1
+; CHECK-NEXT:    vmov.32 d17[1], r0
+; CHECK-NEXT:    vadd.i32 d16, d17, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 0, i32 2>
   %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 1, i32 3>
   %x = add <2 x i8> %tmp2, %tmp1
@@ -557,7 +569,20 @@
 
 define <2 x i16> @fromExtendingExtractVectorElt_2i16(<8 x i16> %in) {
 ; CHECK-LABEL: fromExtendingExtractVectorElt_2i16:
-; CHECK: vadd.i32
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmov.u16 r2, d16[1]
+; CHECK-NEXT:    vmov.u16 r3, d16[0]
+; CHECK-NEXT:    vmov.u16 r0, d16[3]
+; CHECK-NEXT:    vmov.u16 r1, d16[2]
+; CHECK-NEXT:    vmov.32 d16[0], r3
+; CHECK-NEXT:    vmov.32 d17[0], r2
+; CHECK-NEXT:    vmov.32 d16[1], r1
+; CHECK-NEXT:    vmov.32 d17[1], r0
+; CHECK-NEXT:    vadd.i32 d16, d17, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %tmp1 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
   %tmp2 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 1, i32 3>
   %x = add <2 x i16> %tmp2, %tmp1
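Reviewer note, not part of the patch: the ARM deltas above are where the tightened matching stops firing. A minimal sketch of a shape the removed code accepted and the rewritten code rejects (illustrative types, DAG-dump style):

  t5: v4i16 = build_vector undef, (extract_vector_elt t0, Constant:i32<2>),
                  (extract_vector_elt t0, Constant:i32<4>),
                  (extract_vector_elt t0, Constant:i32<6>)

The old code skipped undef lanes and inferred Stride = 2 from the first non-undef extract (index 2 at operand 1); the new code bails out as soon as any operand is not a constant-index extract, which is what the "TODO: Allow undef elements in some cases?" above tracks.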
Index: test/CodeGen/X86/shuffle-vs-trunc-256.ll
===================================================================
--- test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -50,6 +50,7 @@
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqu (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -111,7 +112,7 @@
 ; AVX-LABEL: shuffle_v16i16_to_v8i16:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX-NEXT:    vzeroupper
@@ -129,6 +130,7 @@
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
@@ -143,6 +145,7 @@
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -210,12 +213,14 @@
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512VL-NEXT:    vpmovqd %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
@@ -223,12 +228,14 @@
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovqd %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %L
   %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -285,7 +292,7 @@
 ; AVX-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vmovq %xmm0, (%rsi)
@@ -305,6 +312,7 @@
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
@@ -320,6 +328,7 @@
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -399,6 +408,7 @@
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
@@ -414,6 +424,7 @@
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -492,6 +503,7 @@
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
@@ -507,6 +519,7 @@
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
Index: test/CodeGen/X86/shuffle-vs-trunc-512.ll
===================================================================
--- test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -35,12 +35,14 @@
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqu16 (%rdi), %zmm0
 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
@@ -96,6 +98,7 @@
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vmovdqa32 (%rdi), %zmm0
 ; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
   %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -122,6 +125,7 @@
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %L
   %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -148,6 +152,7 @@
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vmovdqa32 (%rdi), %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
@@ -174,6 +179,7 @@
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
   %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -200,6 +206,7 @@
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
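Reviewer note, not part of the patch: a worked instance of the size checks, read off shuffle_v8i32_to_v4i32 above. The strided shufflevector reaches the DAG as a BUILD_VECTOR of i32 extracts at indices 0, 2, 4, 6 of the loaded v8i32, so Stride = 2, NumElems = 4, the source has Stride * NumElems = 8 elements, and 32 bits * Stride = 64 sits exactly at the 64-bit limit. The node is presumably rebuilt as

  v4i32 truncate (v4i64 bitcast %vec)

which the AVX512VL runs select to the truncating store vpmovqd %ymm0, (%rsi); the wider strides and the 512-bit variants map onto vpmovdw, vpmovdb, vpmovqw and vpmovqb in the same way.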