diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -627,6 +627,7 @@ BS->clear(); } MinBWs.clear(); + InstrElementSize.clear(); } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -5635,53 +5636,58 @@ // that feed it. The type of the loaded value may indicate a more suitable // width than V's type. We want to base the vector element size on the width // of memory operations where possible. - SmallVector Worklist; + SmallVector, 16> Worklist; SmallPtrSet Visited; if (auto *I = dyn_cast(V)) { - Worklist.push_back(I); + Worklist.emplace_back(I, I->getParent()); Visited.insert(I); } // Traverse the expression tree in bottom-up order looking for loads. If we // encounter an instruction we don't yet handle, we give up. - auto MaxWidth = 0u; - auto FoundUnknownInst = false; - while (!Worklist.empty() && !FoundUnknownInst) { - auto *I = Worklist.pop_back_val(); + auto Width = 0u; + while (!Worklist.empty()) { + Instruction *I; + BasicBlock *Parent; + std::tie(I, Parent) = Worklist.pop_back_val(); // We should only be looking at scalar instructions here. If the current - // instruction has a vector type, give up. + // instruction has a vector type, skip. auto *Ty = I->getType(); if (isa(Ty)) - FoundUnknownInst = true; + continue; // If the current instruction is a load, update MaxWidth to reflect the // width of the loaded value. - else if (isa(I)) - MaxWidth = std::max(MaxWidth, DL->getTypeSizeInBits(Ty)); + if (isa(I) || isa(I) || + isa(I)) + Width = std::max(Width, DL->getTypeSizeInBits(Ty)); // Otherwise, we need to visit the operands of the instruction. We only // handle the interesting cases from buildTree here. If an operand is an - // instruction we haven't yet visited, we add it to the worklist. + // instruction we haven't yet visited and from the same basic block as the + // user or the use is a PHI node, we add it to the worklist. else if (isa(I) || isa(I) || isa(I) || - isa(I) || isa(I) || isa(I)) { + isa(I) || isa(I) || isa(I) || + isa(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) - if (Visited.insert(J).second) - Worklist.push_back(J); + if (Visited.insert(J).second && + (isa(I) || J->getParent() == Parent)) + Worklist.emplace_back(J, J->getParent()); + } else { + break; } - - // If we don't yet handle the instruction, give up. - else - FoundUnknownInst = true; } - int Width = MaxWidth; // If we didn't encounter a memory access in the expression tree, or if we // gave up for some reason, just return the width of V. Otherwise, return the // maximum width we found. - if (!MaxWidth || FoundUnknownInst) + if (!Width) { + if (auto *CI = dyn_cast(V)) + V = CI->getOperand(0); Width = DL->getTypeSizeInBits(V->getType()); + } for (Instruction *I : Visited) InstrElementSize[I] = Width; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -61,25 +61,23 @@ ; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32> ; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32> ; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]] -; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0 -; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64 -; CHECK-NEXT: [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]] -; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[A0]] +; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[C0:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP6]] ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4 -; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64 -; CHECK-NEXT: [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A1]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]] ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4 -; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64 -; CHECK-NEXT: [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A2]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP8]] ; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4 -; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 -; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 -; CHECK-NEXT: [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A3]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP9]] ; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4 ; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]]) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll @@ -5,25 +5,17 @@ ; CHECK-LABEL: @inst_size( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VAL:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0 -; CHECK-NEXT: [[TMPL1:%.*]] = load i64, i64* [[A:%.*]], align 4 -; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1 -; CHECK-NEXT: [[TMPL2:%.*]] = load i64, i64* [[PTR2]], align 4 +; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2 -; CHECK-NEXT: [[TMPL3:%.*]] = load i64, i64* [[PTR3]], align 4 ; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 3 -; CHECK-NEXT: [[TMPL4:%.*]] = load i64, i64* [[PTR4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[A]] to <4 x i64>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 4 ; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]] -; CHECK-NEXT: [[T42:%.*]] = icmp sgt i64 0, [[TMPL1]] -; CHECK-NEXT: [[T43:%.*]] = icmp sgt i64 0, [[TMPL2]] -; CHECK-NEXT: [[T44:%.*]] = icmp sgt i64 0, [[TMPL3]] -; CHECK-NEXT: [[T45:%.*]] = icmp sgt i64 0, [[TMPL4]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP1]] ; CHECK-NEXT: br label [[BLOCK:%.*]] ; CHECK: block: ; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[T41]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[PHI2:%.*]] = phi i1 [ [[T42]], [[ENTRY]] ] -; CHECK-NEXT: [[PHI3:%.*]] = phi i1 [ [[T43]], [[ENTRY]] ] -; CHECK-NEXT: [[PHI4:%.*]] = phi i1 [ [[T44]], [[ENTRY]] ] -; CHECK-NEXT: [[PHI5:%.*]] = phi i1 [ [[T45]], [[ENTRY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ] ; CHECK-NEXT: ret void ; entry: