Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -490,6 +490,9 @@
     /// The Scalars are vectorized into this value. It is initialized to Null.
     Value *VectorizedValue;
 
+    /// The scalars in the order they are used in the program.
+    ValueList InOdrScalars;
+
     /// Do we need to gather this sequence ?
     bool NeedToGather;
 
@@ -499,13 +502,17 @@
 
   /// Create a new VectorizableTree entry.
   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
-                          bool NeedToShuffle) {
+                          bool NeedToShuffle,
+                          ArrayRef<Value *> InOdrVL = NULL) {
     VectorizableTree.emplace_back();
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->NeedToGather = !Vectorized;
     Last->NeedToShuffle = NeedToShuffle;
+    if (NeedToShuffle)
+      Last->InOdrScalars.insert(Last->InOdrScalars.begin(), InOdrVL.begin(),
+                                InOdrVL.end());
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
@@ -1262,7 +1269,7 @@
             }
           }
           if (ShuffledLoads) {
-            newTreeEntry(NewVL, true, true);
+            newTreeEntry(NewVL, true, true, VL);
             return;
           }
         }
@@ -2630,6 +2637,8 @@
       Value *Undef = UndefValue::get(VecTy);
       Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
                                                 ConstantVector::get(Mask));
+      E->VectorizedValue = Shuf;
+      ++NumVectorInstructions;
       return Shuf;
     }
 
@@ -2814,7 +2823,7 @@
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
-  auto *VectorRoot = vectorizeTree(ArrayRef<Value *>(), &VectorizableTree[0]);
+  auto *VectorRoot = vectorizeTree(VectorizableTree[0].InOdrScalars, &VectorizableTree[0]);
 
   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   // vectorized root. InstCombine will then rewrite the entire expression. We
Index: test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -slp-vectorizer | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <4 x i32> @zot() #0 {
+; CHECK-LABEL: @zot(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[P0:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 0
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
+; CHECK-NEXT:    [[I0:%.*]] = insertelement <4 x i8> undef, i8 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x i8> [[I0]], i8 [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT:    [[I2:%.*]] = insertelement <4 x i8> [[I1]], i8 [[TMP5]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
+; CHECK-NEXT:    [[I3:%.*]] = insertelement <4 x i8> [[I2]], i8 [[TMP6]], i32 3
+; CHECK-NEXT:    [[RET:%.*]] = zext <4 x i8> [[I3]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[RET]]
+;
+bb:
+  %p0 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 0
+  %p1 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 1
+  %p2 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 2
+  %p3 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 3
+  %v3 = load i8, i8* %p3, align 1
+  %v2 = load i8, i8* %p2, align 1
+  %v0 = load i8, i8* %p0, align 1
+  %v1 = load i8, i8* %p1, align 1
+  %i0 = insertelement <4 x i8> undef, i8 %v1, i32 0
+  %i1 = insertelement <4 x i8> %i0, i8 %v0, i32 1
+  %i2 = insertelement <4 x i8> %i1, i8 %v2, i32 2
+  %i3 = insertelement <4 x i8> %i2, i8 %v3, i32 3
+  %ret = zext <4 x i8> %i3 to <4 x i32>
+  ret <4 x i32> %ret
+}
+