diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3593,7 +3593,15 @@
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
       // Calculate the cost of the scalar and vector calls.
-      IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1);
+      SmallVector<Type *, 4> ScalarTys;
+      for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
+        ScalarTys.push_back(CI->getArgOperand(op)->getType());
+
+      FastMathFlags FMF;
+      if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+        FMF = FPMO->getFastMathFlags();
+
+      IntrinsicCostAttributes CostAttrs(ID, ScalarTy, ScalarTys, FMF, 1);
       int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
diff --git a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s
+
+; Regression test for a bug in the SLP vectorizer that was causing
+; these rotates to be incorrectly combined into a vector rotate.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+define void @foo(<2 x i64> %x, <4 x i32> %y, i64* %out) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
+; CHECK-NEXT:    [[CONV6:%.*]] = zext i32 [[B]] to i64
+; CHECK-NEXT:    [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]])
+; CHECK-NEXT:    store i64 [[C]], i64* [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1
+; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3
+; CHECK-NEXT:    [[CONV17:%.*]] = zext i32 [[E]] to i64
+; CHECK-NEXT:    [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[OUT]], i32 1
+; CHECK-NEXT:    store i64 [[F]], i64* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a = extractelement <2 x i64> %x, i32 0
+  %b = extractelement <4 x i32> %y, i32 2
+  %conv6 = zext i32 %b to i64
+  %c = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %conv6)
+  store i64 %c, i64* %out
+  %d = extractelement <2 x i64> %x, i32 1
+  %e = extractelement <4 x i32> %y, i32 3
+  %conv17 = zext i32 %e to i64
+  %f = tail call i64 @llvm.fshl.i64(i64 %d, i64 %d, i64 %conv17)
+  %arrayidx2 = getelementptr inbounds i64, i64* %out, i32 1
+  store i64 %f, i64* %arrayidx2
+  ret void
+}
+
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+
+attributes #0 = {"target-cpu"="generic" "target-features"="+simd128"}