diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -706,6 +706,14 @@ unsigned AddrSpace = 0, Instruction *I = nullptr) const; + /// Checks if the specified operation with the given vector type is not going + /// to be scalarized. + bool isLegalVectorOp(unsigned, VectorType *) const; + + /// Checks if the specified operation(intrinsic) with the given vector type is + /// not going to be scalarized. + bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const; + /// Return true if LSR cost of C1 is lower than C2. bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const; @@ -1757,6 +1765,10 @@ int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I) = 0; + virtual bool isLegalVectorOp(unsigned, VectorType *) const = 0; + + virtual bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const = 0; + virtual bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) = 0; virtual bool isNumRegsMajorCostOfLSR() = 0; @@ -2198,6 +2210,15 @@ return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale, AddrSpace, I); } + bool isLegalVectorOp(unsigned Opcode, VectorType *VecTy) const override { + return Impl.isLegalVectorOp(Opcode, VecTy); + } + + bool isLegalVectorIntrinsic(Intrinsic::ID Id, + VectorType *VecTy) const override { + return Impl.isLegalVectorIntrinsic(Id, VecTy); + } + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) override { return Impl.isLSRCostLess(C1, C2); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -298,6 +298,12 @@ bool isLegalMaskedExpandLoad(Type *DataType) const { return false; } + bool isLegalVectorOp(unsigned, VectorType *) const { return true; } + + bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const { + return true; + } + bool enableOrderedReductions() const { return false; } bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -342,6 +342,61 @@ return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); } + bool isLegalVectorOp(unsigned Opcode, VectorType *VecTy) const { + int ISD = getTLI()->InstructionOpcodeToISD(Opcode); + EVT VT = getTLI()->getValueType(DL, VecTy); + TargetLoweringBase::LegalizeKind LK = + getTLI()->getTypeConversion(VecTy->getContext(), VT); + return LK.first != TargetLoweringBase::TypeScalarizeVector && + getTLI()->getOperationAction(ISD, LK.second) != + TargetLowering::Expand; + } + + bool isLegalVectorIntrinsic(Intrinsic::ID Id, VectorType *VecTy) const { + unsigned ISD = ISD::DELETED_NODE; + switch (Id) { + default: + break; + case Intrinsic::exp: + ISD = ISD::FEXP; + break; + case Intrinsic::exp2: + ISD = ISD::FEXP2; + break; + case Intrinsic::log: + ISD = ISD::FLOG; + break; + case Intrinsic::log2: + ISD = ISD::FLOG2; + break; + case Intrinsic::log10: + ISD = ISD::FLOG10; + break; + case Intrinsic::sin: + ISD = ISD::FSIN; + break; + case Intrinsic::cos: + ISD = 
ISD::FCOS;
+      break;
+    case Intrinsic::umax:
+      ISD = ISD::UMAX;
+      break;
+    case Intrinsic::umin:
+      ISD = ISD::UMIN;
+      break;
+    case Intrinsic::sqrt:
+      ISD = ISD::FSQRT;
+      break;
+    }
+
+    if (ISD == ISD::DELETED_NODE)
+      return true;
+    EVT VT = getTLI()->getValueType(DL, VecTy);
+    return getTLI()->getTypeAction(VecTy->getContext(), VT) !=
+               TargetLoweringBase::TypeScalarizeVector &&
+           getTLI()->getOperationAction(ISD, VT) != TargetLowering::Expand;
+  }
+
   unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                              Type *ScalarValTy) const {
     auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -393,6 +393,16 @@
                                         Scale, AddrSpace, I);
 }
 
+bool TargetTransformInfo::isLegalVectorOp(unsigned Opcode,
+                                          VectorType *VecTy) const {
+  return TTIImpl->isLegalVectorOp(Opcode, VecTy);
+}
+
+bool TargetTransformInfo::isLegalVectorIntrinsic(Intrinsic::ID Id,
+                                                 VectorType *VecTy) const {
+  return TTIImpl->isLegalVectorIntrinsic(Id, VecTy);
+}
+
 bool TargetTransformInfo::isLSRCostLess(const LSRCost &C1,
                                         const LSRCost &C2) const {
   return TTIImpl->isLSRCostLess(C1, C2);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1188,6 +1188,13 @@
     UserIgnoreList = nullptr;
     PostponedGathers.clear();
     ValueToGatherNodes.clear();
+    OperandsToVectorize.clear();
+  }
+
+  /// Returns the list of operands to try to vectorize later if the user node
+  /// was not vectorized.
+  ArrayRef<SmallVector<Value *>> operandsToVectorize() const {
+    return OperandsToVectorize;
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -2426,6 +2433,10 @@
   bool areAllUsersVectorized(Instruction *I,
                              ArrayRef<Value *> VectorizedVals) const;
 
+  /// Checks if the list of values is worth vectorizing and is not going to be
+  /// scalarized later.
+  bool isLegalVectorOp(ArrayRef<Value *> VL);
+
   /// Return information about the vector formed for the specified index
   /// of a vector of (the same) instruction.
   TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL,
@@ -2945,6 +2956,10 @@
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
+  /// A list of the operands of the nodes that were not vectorized. These
+  /// operands are candidates for vectorization later.
+  SmallVector<SmallVector<Value *>> OperandsToVectorize;
+
   /// A map between the vectorized entries and the last instructions in the
   /// bundles. The bundles are built in use order, not in the def order of the
   /// instructions. So, we cannot rely directly on the last instruction in the
@@ -5447,6 +5462,21 @@
   if (!TryToFindDuplicates(S))
     return;
 
+  // Check if the generated vector instruction won't be scalarized later.
+  if (!isLegalVectorOp(VL)) {
+    LLVM_DEBUG(dbgs() << "SLP: scalarized bundle starting " << *S.OpValue
+                      << ".\n");
+    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
+                 ReuseShuffleIndicies);
+    // Gather operands to try to vectorize them later.
+    for (unsigned I = 0, End = S.MainOp->getNumOperands(); I < End; ++I) {
+      auto &Operands = OperandsToVectorize.emplace_back();
+      for (Value *V : VL)
+        Operands.push_back(cast<Instruction>(V)->getOperand(I));
+    }
+    return;
+  }
+
   auto &BSRef = BlocksSchedules[BB];
   if (!BSRef)
     BSRef = std::make_unique<BlockScheduling>(BB);
@@ -6365,6 +6395,75 @@
   return I->getOpcode() == AltOp->getOpcode();
 }
 
+bool BoUpSLP::isLegalVectorOp(ArrayRef<Value *> VL) {
+  InstructionsState S = getSameOpcode(VL, *TLI);
+  const unsigned Sz = VL.size();
+  Value *V0 = VL.front();
+  Type *ScalarTy = V0->getType();
+  if (isa(V0))
+    return true;
+  if (auto *CI = dyn_cast(V0))
+    ScalarTy = CI->getOperand(0)->getType();
+  else if (auto *CI = dyn_cast(V0))
+    if (!isa(CI))
+      ScalarTy = CI->getSrcTy();
+  if (!isValidElementType(ScalarTy))
+    return false;
+  auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
+
+  // If we have computed a smaller type for the expression, update VecTy so
+  // that the costs will be accurate.
+  const auto It = MinBWs.find(VL[0]);
+  if (It != MinBWs.end())
+    VecTy = FixedVectorType::get(
+        IntegerType::get(F->getContext(), It->second.first), VL.size());
+
+  unsigned ShuffleOrOp =
+      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
+  switch (ShuffleOrOp) {
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::UDiv:
+  case Instruction::SDiv: {
+    // Check if it can be represented as a shift.
+    TTI::OperandValueInfo OVI = getOperandInfo(VL, 1);
+    if (OVI.isConstant())
+      return true;
+    return TTI->isLegalVectorOp(ShuffleOrOp, VecTy);
+  }
+  case Instruction::Mul: {
+    // Check if it can be represented as a shift.
+    TTI::OperandValueInfo OVI = getOperandInfo(VL, 1);
+    if (OVI.isConstant())
+      return true;
+    return TTI->isLegalVectorOp(ShuffleOrOp, VecTy);
+  }
+  case Instruction::FNeg:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return TTI->isLegalVectorOp(ShuffleOrOp, VecTy);
+  case Instruction::Call: {
+    auto *CI = cast<CallInst>(V0);
+    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+    return (VecCallCosts.first > VecCallCosts.second ||
+            TTI->isLegalVectorIntrinsic(CI->getIntrinsicID(), VecTy));
+  }
+  default:
+    return true;
+  }
+}
+
 TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL,
                                               unsigned OpIdx) {
   assert(!VL.empty());
@@ -12218,6 +12317,51 @@
   return Changed;
 }
 
+static bool vectorizeOperands(BoUpSLP &R) {
+  SmallVector<SmallVector<Value *>> Operands(R.operandsToVectorize().begin(),
+                                             R.operandsToVectorize().end());
+  DenseSet<hash_code> VisitedOperands;
+  bool Changed = false;
+  while (!Operands.empty()) {
+    SmallVector<Value *> Chain = Operands.pop_back_val();
+    if (!VisitedOperands.insert(hash_value(ArrayRef(Chain))).second)
+      continue;
+    unsigned VF = Chain.size();
+    R.buildTree(Chain);
+    if (R.isTreeTinyAndNotFullyVectorizable())
+      return false;
+    if (R.isLoadCombineCandidate())
+      return false;
+    R.reorderTopToBottom();
+    R.reorderBottomToTop();
+    R.buildExternalUses();
+
+    R.computeMinimumValueSizes();
+
+    InstructionCost Cost = R.getTreeCost();
+
+    LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF
+                      << "\n");
+    if (Cost < -SLPCostThreshold) {
+      LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
+
+      using namespace ore;
+
+      R.getORE()->emit(OptimizationRemark(SV_NAME, "OperandsVectorized",
+                                          cast<Instruction>(Chain[0]))
+ << "Operands SLP vectorized with cost " + << NV("Cost", Cost) << " and with tree size " + << NV("TreeSize", R.getTreeSize())); + + R.vectorizeTree(); + Changed = true; + } + Operands.append(R.operandsToVectorize().begin(), + R.operandsToVectorize().end()); + } + return Changed; +} + bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned Idx, unsigned MinVF) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -12257,10 +12401,11 @@ << NV("TreeSize", R.getTreeSize())); R.vectorizeTree(); + (void)vectorizeOperands(R); return true; } - return false; + return vectorizeOperands(R); } bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, @@ -12555,6 +12700,7 @@ NextInst = I + 1; Changed = true; } + Changed |= vectorizeOperands(R); } } @@ -13549,6 +13695,7 @@ // Vectorize a tree. Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues, ReplacedExternals, InsertPt); + (void)vectorizeOperands(V); Builder.SetInsertPoint(InsertPt); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -24,11 +24,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -997,11 +999,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> 
[[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -24,11 +24,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -997,11 +999,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; 
NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -6,12 +6,26 @@ ; Simple 3-pair chain with loads and stores define amdgpu_kernel void @test1_as_3_3_3_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c) { -; GCN-LABEL: @test1_as_3_3_3_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]] -; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @test1_as_3_3_3_v2f16( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 +; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]] +; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @test1_as_3_3_3_v2f16( +; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2 +; VI-NEXT: [[I1:%.*]] = load half, ptr addrspace(3) [[B:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2 +; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1 +; VI-NEXT: [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]] +; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 %i1 = load half, ptr addrspace(3) %b, align 2 @@ -28,12 +42,26 @@ } define amdgpu_kernel void @test1_as_3_0_0(ptr addrspace(3) %a, ptr %b, ptr %c) { -; GCN-LABEL: @test1_as_3_0_0( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]] -; GCN-NEXT: store <2 x half> [[TMP5]], ptr [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @test1_as_3_0_0( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 +; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]] +; GFX9-NEXT: store <2 x half> [[TMP3]], ptr [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @test1_as_3_0_0( +; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2 +; VI-NEXT: [[I1:%.*]] = load half, ptr [[B:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], 
align 2 +; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr [[B]], i64 1 +; VI-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX4]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]] +; VI-NEXT: store half [[MUL]], ptr [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 %i1 = load half, ptr %b, align 2 @@ -50,12 +78,26 @@ } define amdgpu_kernel void @test1_as_0_0_3_v2f16(ptr %a, ptr %b, ptr addrspace(3) %c) { -; GCN-LABEL: @test1_as_0_0_3_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]] -; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @test1_as_0_0_3_v2f16( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 +; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]] +; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @test1_as_0_0_3_v2f16( +; VI-NEXT: [[I0:%.*]] = load half, ptr [[A:%.*]], align 2 +; VI-NEXT: [[I1:%.*]] = load half, ptr [[B:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr [[ARRAYIDX3]], align 2 +; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr [[B]], i64 1 +; VI-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX4]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]] +; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr %a, align 2 %i1 = load half, ptr %b, align 2 @@ -73,11 +115,11 @@ define amdgpu_kernel void @test1_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) { ; GCN-LABEL: @test1_fma_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 -; GCN-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP4]], <2 x half> [[TMP6]]) -; GCN-NEXT: store <2 x half> [[TMP7]], ptr addrspace(3) [[D:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 +; GCN-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP2]], <2 x half> [[TMP3]]) +; GCN-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[D:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -98,13 +140,24 @@ } define amdgpu_kernel void @mul_scalar_v2f16(ptr addrspace(3) %a, half %scalar, ptr addrspace(3) %c) { -; GCN-LABEL: @mul_scalar_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP3:%.*]] = 
insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0 -; GCN-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x half> [[TMP3]], <2 x half> poison, <2 x i32> zeroinitializer -; GCN-NEXT: [[TMP4:%.*]] = fmul <2 x half> [[TMP2]], [[SHUFFLE]] -; GCN-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @mul_scalar_v2f16( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0 +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <2 x i32> zeroinitializer +; GFX9-NEXT: [[TMP4:%.*]] = fmul <2 x half> [[TMP1]], [[TMP3]] +; GFX9-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @mul_scalar_v2f16( +; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[SCALAR:%.*]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[SCALAR]] +; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 %mul = fmul half %i0, %scalar @@ -119,9 +172,9 @@ define amdgpu_kernel void @fabs_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) { ; GCN-LABEL: @fabs_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]]) -; GCN-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]]) +; GCN-NEXT: store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -137,12 +190,12 @@ define amdgpu_kernel void @test1_fabs_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) { ; GCN-LABEL: @test1_fabs_fma_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 -; GCN-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]]) -; GCN-NEXT: [[TMP8:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP7]], <2 x half> [[TMP4]], <2 x half> [[TMP6]]) -; GCN-NEXT: store <2 x half> [[TMP8]], ptr addrspace(3) [[D:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 +; GCN-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]]) +; GCN-NEXT: [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP4]], <2 x half> [[TMP2]], <2 x half> [[TMP3]]) +; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -172,12 +225,12 @@ ; GCN-NEXT: 
[[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]]) ; GCN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1 ; GCN-NEXT: [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2 -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 -; GCN-NEXT: [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[I4]], i32 1 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP6]], <2 x half> [[TMP4]]) -; GCN-NEXT: store <2 x half> [[TMP7]], ptr addrspace(3) [[D:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP3:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 +; GCN-NEXT: [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[I4]], i32 1 +; GCN-NEXT: [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP4]], <2 x half> [[TMP2]]) +; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -201,9 +254,9 @@ define amdgpu_kernel void @canonicalize_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) { ; GFX9-LABEL: @canonicalize_v2f16( -; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GFX9-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP2]]) -; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP1]]) +; GFX9-NEXT: store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2 ; GFX9-NEXT: ret void ; ; VI-LABEL: @canonicalize_v2f16( diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll --- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll @@ -208,11 +208,13 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[VECINS_31]] +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret 
<4 x float> [[VECINS_3]] ; ; DEFAULT-LABEL: define <4 x float> @int_exp_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -224,11 +226,13 @@ ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) -; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -314,11 +318,13 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[VECINS_31]] +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] ; ; DEFAULT-LABEL: define <4 x float> @int_log_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -330,11 +336,13 @@ ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) -; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; DEFAULT-NEXT: ret <4 x float> 
[[VECINS_31]] +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -420,11 +428,13 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[VECINS_31]] +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] ; ; DEFAULT-LABEL: define <4 x float> @int_sin_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -436,11 +446,13 @@ ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll @@ -3,7 +3,21 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { ; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: ret <8 x i32> poison +; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 +; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 +; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[R71]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll @@ -8,10 +8,10 @@ ; Base case with no interesting control dependencies define void @test_no_control(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test_no_control( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -34,11 +34,11 @@ ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -60,11 +60,11 @@ ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = 
insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %c1 = load i64, ptr %c @@ -87,11 +87,11 @@ ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -113,11 +113,11 @@ ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -139,11 +139,11 @@ ; CHECK-LABEL: @test5( ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %a2 = getelementptr i64, ptr %a, i32 1 @@ -164,10 +164,10 @@ define void @test6(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: 
[[TMP1:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -200,11 +200,11 @@ ; CHECK-NEXT: store i64 0, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -233,11 +233,11 @@ ; CHECK-NEXT: store i64 0, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -266,11 +266,11 @@ ; CHECK-NEXT: store i64 0, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() ; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -294,18 +294,18 @@ ; A variant of test7 which shows the same problem with a non-load 
instruction define void @test10(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test10( -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A]], i32 1 -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[V1]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[TMP2]] ; CHECK-NEXT: store i64 [[U1]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[V2]] -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @may_inf_loop_ro() +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[U2]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP7]], [[TMP5]] +; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -337,11 +337,11 @@ ; CHECK-NEXT: store i64 [[U1]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[Y:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %u1 = udiv i64 200, %x diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll @@ -10,19 +10,26 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: 
[[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[D0:%.*]] = sdiv i64 [[P0]], [[P0]] +; CHECK-NEXT: [[D1:%.*]] = sdiv i64 [[P1]], [[P1]] +; CHECK-NEXT: [[D2:%.*]] = sdiv i64 [[P2]], [[P2]] +; CHECK-NEXT: [[D3:%.*]] = sdiv i64 [[P3]], [[P3]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[D0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[D1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[D2]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[D3]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i64> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shl <4 x i64> [[TMP4]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[D0]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[D1]], i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i64> [[TMP13]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP19:%.*]], [[BB]] ], [ [[TMP17]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP19]] = trunc <4 x i64> [[TMP11]] to <4 x i32> ; CHECK-NEXT: br label [[BB]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -9,30 +9,32 @@ define void @test() { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 -; CHECK-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 ; CHECK-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 ; CHECK-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 ; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double 
[[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) -; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] -; CHECK-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 +; CHECK-NEXT: [[SIN0:%.*]] = call fast double @llvm.sin.f64(double [[A2]]) +; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]]) +; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]]) +; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]]) +; CHECK-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 +; CHECK-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A4]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[SIN3]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[SIN0]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[SIN2]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP11]], [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP12]] +; CHECK-NEXT: store <2 x double> [[TMP13]], ptr @dst, align 8 ; CHECK-NEXT: ret void ; %a0 = load double, ptr @src, align 8