diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5782,6 +5782,68 @@ }); }
+namespace {
+/// Collects the vector lanes demanded by scalar extractions so that a single
+/// accumulated scalarization (extraction) cost can be queried afterwards.
+struct ScalarizationOverheadBuilder {
+  /// Demanded-lane masks, keyed by the source vector value or by vector type.
+  DenseMap<Value *, APInt> ExtractByClass;
+  DenseMap<FixedVectorType *, APInt> ExtractByType;
+
+  // TODO: Add getExtractWithExtendCost support to getScalarizationOverhead.
+  struct ExtractWithExtendOps {
+    unsigned Opcode;
+    VectorType *VecTy;
+    Type *SclTy;
+    unsigned Idx;
+  };
+  SmallVector<ExtractWithExtendOps> ExtractWithExtends;
+
+  /// Record an extraction of lane \p Idx from the vector value \p Src.
+  /// \p Idx must be smaller than the number of elements of \p Src.
+  void addExtract(Value *Src, unsigned Idx) {
+    unsigned NumElts = cast<FixedVectorType>(Src->getType())->getNumElements();
+    // Single lookup: insert a one-bit mask or set the bit in the existing one.
+    auto Res =
+        ExtractByClass.try_emplace(Src, APInt::getOneBitSet(NumElts, Idx));
+    if (!Res.second)
+      Res.first->second.setBit(Idx);
+  }
+
+  /// Record an extraction of lane \p Idx from a vector of type \p VecTy.
+  /// We assume that all extractions from a given type are from the same source.
+  void addExtract(FixedVectorType *VecTy, unsigned Idx) {
+    unsigned NumElts = VecTy->getNumElements();
+    // Single lookup: insert a one-bit mask or set the bit in the existing one.
+    auto Res =
+        ExtractByType.try_emplace(VecTy, APInt::getOneBitSet(NumElts, Idx));
+    if (!Res.second)
+      Res.first->second.setBit(Idx);
+  }
+
+  /// Record an extract of lane \p Idx of \p VecTy whose result is extended to
+  /// \p SclTy by \p Opcode; these pairs are costed individually in getCost().
+  void addExtractWithExtend(unsigned Opcode, Type *SclTy, VectorType *VecTy,
+                            unsigned Idx) {
+    ExtractWithExtends.push_back({Opcode, VecTy, SclTy, Idx});
+  }
+
+  /// Sum the TTI cost of all recorded extracts (Insert=false, Extract=true).
+  InstructionCost getCost(const TargetTransformInfo *TTI) const {
+    InstructionCost Cost = 0;
+    for (const ExtractWithExtendOps &Op : ExtractWithExtends)
+      Cost +=
+          TTI->getExtractWithExtendCost(Op.Opcode, Op.SclTy, Op.VecTy, Op.Idx);
+    for (const auto &It : ExtractByType)
+      Cost += TTI->getScalarizationOverhead(It.first, It.second, false, true);
+    for (const auto &It : ExtractByClass)
+      Cost += TTI->getScalarizationOverhead(
+          cast<FixedVectorType>(It.first->getType()), It.second, false, true);
+    return Cost;
+  }
+};
+} // namespace
+
 static std::pair getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI) { @@ -6011,7 +6073,7 @@ TargetTransformInfo &TTIRef = *TTI; auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, VectorizedVals, E](InstructionCost &Cost) { - DenseMap ExtractVectorsTys; + ScalarizationOverheadBuilder ScalarizationCost; SmallPtrSet CheckedExtracts; for (auto *V : VL) { if (isa(V)) @@ -6032,12 +6094,6 @@ if (!EEIdx) continue; unsigned Idx = *EEIdx; - if (TTIRef.getNumberOfParts(VecTy) != - TTIRef.getNumberOfParts(EE->getVectorOperandType())) { - auto It = - ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; - It->getSecond() = std::min(It->second, Idx); - } // Take credit for instruction that will become dead. if (EE->hasOneUse()) { Instruction *Ext = EE->user_back(); @@ -6046,9 +6102,9 @@ })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. - Cost -= - TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), - EE->getVectorOperandType(), Idx); + ScalarizationCost.addExtractWithExtend( + Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(), + Idx); // Add back the cost of s|zext which is subtracted separately. Cost += TTIRef.getCastInstrCost( Ext->getOpcode(), Ext->getType(), EE->getType(), @@ -6056,36 +6112,9 @@ continue; } } - Cost -= TTIRef.getVectorInstrCost(*EE, EE->getVectorOperandType(), Idx); - } - // Add a cost for subvector extracts/inserts if required. 
- for (const auto &Data : ExtractVectorsTys) { - auto *EEVTy = cast(Data.first->getType()); - unsigned NumElts = VecTy->getNumElements(); - if (Data.second % NumElts == 0) - continue; - if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { - unsigned Idx = (Data.second / NumElts) * NumElts; - unsigned EENumElts = EEVTy->getNumElements(); - if (Idx + NumElts <= EENumElts) { - Cost += - TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, None, CostKind, Idx, VecTy); - } else { - // Need to round up the subvector type vectorization factor to avoid a - // crash in cost model functions. Make SubVT so that Idx + VF of SubVT - // <= EENumElts. - auto *SubVT = - FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); - Cost += - TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, None, CostKind, Idx, SubVT); - } - } else { - Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, None, CostKind, 0, EEVTy); - } + ScalarizationCost.addExtract(EE->getVectorOperand(), Idx); } + Cost -= ScalarizationCost.getCost(&TTIRef); }; if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) @@ -6286,16 +6315,16 @@ case Instruction::ExtractElement: { // The common cost of removal ExtractElement/ExtractValue instructions + // the cost of shuffles, if required to resuffle the original vector. 
+ ScalarizationOverheadBuilder ScalarizationCost, ReuseScalarizationCost; if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *EE = cast(VL[I]); - CommonCost -= TTI->getVectorInstrCost( - *EE, EE->getVectorOperandType(), *getExtractIndex(EE)); + ReuseScalarizationCost.addExtract(EE->getVectorOperand(), + *getExtractIndex(EE)); } else { - CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); + ReuseScalarizationCost.addExtract(VecTy, Idx); ++Idx; } } @@ -6303,16 +6332,18 @@ for (Value *V : VL) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *EE = cast(V); - CommonCost += TTI->getVectorInstrCost( - *EE, EE->getVectorOperandType(), *getExtractIndex(EE)); + ScalarizationCost.addExtract(EE->getVectorOperand(), + *getExtractIndex(EE)); } else { --Idx; - CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, - VecTy, Idx); + ScalarizationCost.addExtract(VecTy, Idx); } } + CommonCost -= ReuseScalarizationCost.getCost(TTI); + CommonCost += ScalarizationCost.getCost(TTI); } if (ShuffleOrOp == Instruction::ExtractValue) { + ScalarizationOverheadBuilder ValueScalarizationCost; for (unsigned I = 0, E = VL.size(); I < E; ++I) { auto *EI = cast(VL[I]); // Take credit for instruction that will become dead. @@ -6321,20 +6352,20 @@ if (isa(Ext) && all_of(Ext->users(), [](User *U) { return isa(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - CommonCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, I); - // Add back the cost of s|zext which is subtracted separately. - CommonCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), EI->getType(), - TTI::getCastContextHint(Ext), CostKind, Ext); - continue; + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. 
+ ValueScalarizationCost.addExtractWithExtend( + Ext->getOpcode(), Ext->getType(), VecTy, I); + // Add back the cost of s|zext which is subtracted separately. + CommonCost += TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EI->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; } } - CommonCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); + ValueScalarizationCost.addExtract(VecTy, I); } + CommonCost -= ValueScalarizationCost.getCost(TTI); } else { AdjustExtractsCost(CommonCost); } @@ -7234,6 +7265,7 @@ SmallVector>> ShuffleMasks; SmallVector> FirstUsers; SmallVector DemandedElts; + ScalarizationOverheadBuilder ScalarizationCost; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!isa_and_nonnull(EU.User) && @@ -7324,20 +7356,20 @@ // If we plan to rewrite the tree in a smaller type, we will need to sign // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. - auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = MinBWs[ScalarRoot].second ? 
Instruction::SExt : Instruction::ZExt; - VecTy = FixedVectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); + auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); + ScalarizationCost.addExtractWithExtend(Extend, EU.Scalar->getType(), + VecTy, EU.Lane); } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); + ScalarizationCost.addExtract(VecTy, EU.Lane); } } + ExtractCost += ScalarizationCost.getCost(TTI); InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll --- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll @@ -50,7 +50,7 @@ ; CHECK-NEXT: [[INPUT_SPILL_ADDR_I:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 8 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[INPUT_SPILL_ADDR_I]] to i32* ; CHECK-NEXT: store i32 2, i32* [[TMP4]], align 4, !noalias !0 -; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) ; CHECK-NEXT: [[FRAMEPTR_I1:%.*]] = load %f.Frame*, %f.Frame** [[TMP2]], align 8, !alias.scope !3 ; CHECK-NEXT: [[INPUT_RELOAD_ADDR13_I:%.*]] = getelementptr inbounds [[F_FRAME:%.*]], %f.Frame* [[FRAMEPTR_I1]], i64 0, i32 2 ; CHECK-NEXT: [[INPUT_RELOAD14_I:%.*]] = load i32, i32* [[INPUT_RELOAD_ADDR13_I]], align 4, !noalias !3 @@ -59,16 +59,16 @@ ; CHECK-NEXT: [[SUM7_I:%.*]] = add i32 [[N_VAL3_RELOAD12_I]], [[INPUT_RELOAD14_I]] ; CHECK-NEXT: store i32 [[SUM7_I]], i32* [[N_VAL3_RELOAD_ADDR11_I]], align 4, !noalias !3 ; CHECK-NEXT: store i32 4, i32* [[INPUT_RELOAD_ADDR13_I]], align 4, !noalias !3 -; 
CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) ; CHECK-NEXT: [[FRAMEPTR_I2:%.*]] = load %f.Frame*, %f.Frame** [[TMP2]], align 8, !alias.scope !6 -; CHECK-NEXT: [[INPUT_RELOAD_ADDR13_I3:%.*]] = getelementptr inbounds [[F_FRAME]], %f.Frame* [[FRAMEPTR_I2]], i64 0, i32 2 -; CHECK-NEXT: [[INPUT_RELOAD14_I4:%.*]] = load i32, i32* [[INPUT_RELOAD_ADDR13_I3]], align 4, !noalias !6 ; CHECK-NEXT: [[N_VAL3_RELOAD_ADDR11_I5:%.*]] = getelementptr inbounds [[F_FRAME]], %f.Frame* [[FRAMEPTR_I2]], i64 0, i32 1 -; CHECK-NEXT: [[N_VAL3_RELOAD12_I6:%.*]] = load i32, i32* [[N_VAL3_RELOAD_ADDR11_I5]], align 4, !noalias !6 -; CHECK-NEXT: [[SUM7_I7:%.*]] = add i32 [[N_VAL3_RELOAD12_I6]], [[INPUT_RELOAD14_I4]] -; CHECK-NEXT: call void @print(i32 [[SUM7_I7]]), !noalias !6 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast %f.Frame* [[FRAMEPTR_I2]] to i8* -; CHECK-NEXT: call void @deallocate(i8* [[TMP5]]), !noalias !6 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[N_VAL3_RELOAD_ADDR11_I5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[N_VAL3_RELOAD_ADDR11_I5]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 +; CHECK-NEXT: [[SUM7_I7:%.*]] = add i32 [[TMP5]], [[TMP7]] +; CHECK-NEXT: tail call void @print(i32 [[SUM7_I7]]), !noalias !6 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast %f.Frame* [[FRAMEPTR_I2]] to i8* +; CHECK-NEXT: tail call void @deallocate(i8* [[TMP8]]), !noalias !6 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -274,7 +274,6 @@ ; PR43745 - https://bugs.llvm.org/show_bug.cgi?id=43745 -; FIXME: this should be vectorized define i1 @cmp_lt_gt(double %a, double %b, 
double %c) { ; CHECK-LABEL: @cmp_lt_gt( ; CHECK-NEXT: entry: @@ -288,17 +287,16 @@ ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i64 1 -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i64 0 -; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[CMP4]], i1 false +; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i64 1 +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP10]], i1 [[TMP9]], i1 false ; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: -; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i64 1 -; CHECK-NEXT: [[OR_COND1:%.*]] = select i1 [[TMP12]], i1 true, i1 [[TMP11]] +; CHECK-NEXT: [[TMP11:%.*]] = fcmp ule <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i64 1 +; CHECK-NEXT: [[OR_COND1:%.*]] = select i1 [[TMP13]], i1 true, i1 [[TMP12]] ; CHECK-NEXT: br label [[CLEANUP]] ; CHECK: cleanup: ; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[OR_COND1]], [[LOR_LHS_FALSE]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll --- 
a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll @@ -29,20 +29,19 @@ ; SSE-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer ; SSE-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> ; SSE-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; SSE-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 -; SSE-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 -; SSE-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 -; SSE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 -; SSE-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i8>, ptr [[ARRAYIDX_13]], align 1 +; SSE-NEXT: [[TMP8:%.*]] = icmp eq <2 x i8> [[TMP7]], zeroinitializer +; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; SSE-NEXT: [[OR_13:%.*]] = select i1 [[TMP9]], i64 0, i64 8192 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; SSE-NEXT: [[OR_14:%.*]] = select i1 [[TMP10]], i64 0, i64 16384 ; SSE-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; SSE-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 +; SSE-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; SSE-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP11]], 0 ; SSE-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; SSE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) -; SSE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] +; SSE-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) +; 
SSE-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) +; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP12]], [[TMP13]] ; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_13]] ; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]] ; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] @@ -63,20 +62,19 @@ ; AVX-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer ; AVX-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> ; AVX-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; AVX-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 -; AVX-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 -; AVX-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 -; AVX-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 -; AVX-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 -; AVX-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 -; AVX-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 +; AVX-NEXT: [[TMP7:%.*]] = load <2 x i8>, ptr [[ARRAYIDX_13]], align 1 +; AVX-NEXT: [[TMP8:%.*]] = icmp eq <2 x i8> [[TMP7]], zeroinitializer +; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; AVX-NEXT: [[OR_13:%.*]] = select i1 [[TMP9]], i64 0, i64 8192 +; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; AVX-NEXT: [[OR_14:%.*]] = select i1 [[TMP10]], i64 0, i64 16384 ; AVX-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; AVX-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; AVX-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 +; AVX-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; AVX-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP11]], 0 ; AVX-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; AVX-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP11:%.*]] 
= call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] +; AVX-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) +; AVX-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) +; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP12]], [[TMP13]] ; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_13]] ; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]] ; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] @@ -208,19 +206,18 @@ ; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer ; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 -; SSE-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 -; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 -; SSE-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 -; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_5]], align 2 +; SSE-NEXT: [[TMP5:%.*]] = icmp eq <2 x i16> [[TMP4]], zeroinitializer +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TMP6]], i64 0, i64 32 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TMP7]], i64 0, i64 64 ; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 -; SSE-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 +; SSE-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP8]], 
0 ; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] +; SSE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[OR_5]] ; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] ; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] ; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] @@ -236,19 +233,18 @@ ; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer ; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 -; AVX-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 -; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 -; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 -; AVX-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 -; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 -; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; AVX-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_5]], align 2 +; AVX-NEXT: [[TMP5:%.*]] = icmp eq <2 x i16> [[TMP4]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TMP6]], i64 0, i64 32 +; AVX-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TMP7]], i64 0, i64 64 ; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 -; AVX-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 +; AVX-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP8]], 
0 ; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] +; AVX-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[OR_5]] ; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] ; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] ; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] @@ -333,19 +329,18 @@ ; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer ; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 -; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 -; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 -; SSE-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 -; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_5]], align 4 +; SSE-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[TMP4]], zeroinitializer +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TMP6]], i64 0, i64 32 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TMP7]], i64 0, i64 64 ; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 -; SSE-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 +; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 +; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP8]], 
0 ; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] +; SSE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[OR_5]] ; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] ; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] ; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] @@ -361,19 +356,18 @@ ; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer ; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 -; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 -; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 -; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 -; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 -; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 -; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; AVX-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_5]], align 4 +; AVX-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[TMP4]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TMP6]], i64 0, i64 32 +; AVX-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TMP7]], i64 0, i64 64 ; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 -; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 +; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 +; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP8]], 
0 ; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] +; AVX-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[OR_5]] ; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] ; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] ; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] @@ -504,15 +498,14 @@ ; SSE4-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0 ; SSE4-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 ; SSE4-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6 -; SSE4-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8 -; SSE4-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i64 [[TMP5]], 0 -; SSE4-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; SSE4-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7 -; SSE4-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 -; SSE4-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0 -; SSE4-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE4-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] +; SSE4-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_6]], align 8 +; SSE4-NEXT: [[TMP6:%.*]] = icmp eq <2 x i64> [[TMP5]], zeroinitializer +; SSE4-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; SSE4-NEXT: [[OR_6:%.*]] = select i1 [[TMP7]], i64 0, i64 64 +; SSE4-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; SSE4-NEXT: [[OR_7:%.*]] = select i1 [[TMP8]], i64 0, i64 128 +; SSE4-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[OR_5]] ; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] 
; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] ; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] @@ -532,15 +525,14 @@ ; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0 ; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 ; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6 -; AVX-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8 -; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i64 [[TMP5]], 0 -; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7 -; AVX-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 -; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0 -; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] +; AVX-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_6]], align 8 +; AVX-NEXT: [[TMP6:%.*]] = icmp eq <2 x i64> [[TMP5]], zeroinitializer +; AVX-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TMP7]], i64 0, i64 64 +; AVX-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TMP8]], i64 0, i64 128 +; AVX-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[OR_5]] ; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] ; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] ; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; 
RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX %struct.ray = type { %struct.vec3, %struct.vec3 } %struct.vec3 = type { double, double, double } @@ -9,86 +9,166 @@ %struct.material = type { %struct.vec3, double, double } define i32 @ray_sphere(ptr nocapture noundef readonly %sph, ptr nocapture noundef readonly byval(%struct.ray) align 8 %ray, ptr nocapture noundef readnone %sp) { -; CHECK-LABEL: @ray_sphere( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[DIR]], align 8 -; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[Y]], align 8 -; CHECK-NEXT: [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]]) -; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[Z]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]]) -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[RAY]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8 -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]] 
-; CHECK-NEXT: [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00 -; CHECK-NEXT: [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[Y19]], align 8 -; CHECK-NEXT: [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[Y21]], align 8 -; CHECK-NEXT: [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]] -; CHECK-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]]) -; CHECK-NEXT: [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00 -; CHECK-NEXT: [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr [[Z28]], align 8 -; CHECK-NEXT: [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = load double, ptr [[Z30]], align 8 -; CHECK-NEXT: [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]]) -; CHECK-NEXT: [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]]) -; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]]) -; CHECK-NEXT: [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]]) -; CHECK-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]]) -; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]]) -; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[TMP6]] -; CHECK-NEXT: [[TMP18:%.*]] = fneg double [[TMP8]] -; CHECK-NEXT: [[NEG:%.*]] = fmul double 
[[TMP7]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]]) -; CHECK-NEXT: [[NEG78:%.*]] = fneg double [[TMP11]] -; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]]) -; CHECK-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]]) -; CHECK-NEXT: [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1 -; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[RAD]], align 8 -; CHECK-NEXT: [[NEG82:%.*]] = fneg double [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]]) -; CHECK-NEXT: [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00 -; CHECK-NEXT: [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] -; CHECK: if.end: -; CHECK-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]]) -; CHECK-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] -; CHECK-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 -; CHECK-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[MUL88]], i32 1 
-; CHECK-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1 -; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0 -; CHECK-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] -; CHECK: lor.lhs.false: -; CHECK-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1 -; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]] -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 -; CHECK-NEXT: br label [[CLEANUP]] -; CHECK: cleanup: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ] -; CHECK-NEXT: ret i32 [[RETVAL_0]] +; SSE-LABEL: @ray_sphere( +; SSE-NEXT: entry: +; SSE-NEXT: [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1 +; SSE-NEXT: [[TMP0:%.*]] = load double, ptr [[DIR]], align 8 +; SSE-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1 +; SSE-NEXT: [[TMP1:%.*]] = load double, ptr [[Y]], align 8 +; SSE-NEXT: [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]] +; SSE-NEXT: [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]]) +; SSE-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2 +; SSE-NEXT: [[TMP3:%.*]] = load double, ptr [[Z]], align 8 +; SSE-NEXT: [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]]) +; SSE-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00 
+; SSE-NEXT: [[TMP5:%.*]] = load double, ptr [[RAY]], align 8 +; SSE-NEXT: [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8 +; SSE-NEXT: [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]] +; SSE-NEXT: [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00 +; SSE-NEXT: [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1 +; SSE-NEXT: [[TMP7:%.*]] = load double, ptr [[Y19]], align 8 +; SSE-NEXT: [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1 +; SSE-NEXT: [[TMP8:%.*]] = load double, ptr [[Y21]], align 8 +; SSE-NEXT: [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]] +; SSE-NEXT: [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]] +; SSE-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]]) +; SSE-NEXT: [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00 +; SSE-NEXT: [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2 +; SSE-NEXT: [[TMP10:%.*]] = load double, ptr [[Z28]], align 8 +; SSE-NEXT: [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2 +; SSE-NEXT: [[TMP11:%.*]] = load double, ptr [[Z30]], align 8 +; SSE-NEXT: [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]] +; SSE-NEXT: [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]]) +; SSE-NEXT: [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]] +; SSE-NEXT: [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]]) +; SSE-NEXT: [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]]) +; SSE-NEXT: [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]]) +; SSE-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]]) 
+; SSE-NEXT: [[FNEG:%.*]] = fneg double [[TMP6]] +; SSE-NEXT: [[TMP18:%.*]] = fneg double [[TMP8]] +; SSE-NEXT: [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]] +; SSE-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]]) +; SSE-NEXT: [[NEG78:%.*]] = fneg double [[TMP11]] +; SSE-NEXT: [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]]) +; SSE-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]]) +; SSE-NEXT: [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1 +; SSE-NEXT: [[TMP22:%.*]] = load double, ptr [[RAD]], align 8 +; SSE-NEXT: [[NEG82:%.*]] = fneg double [[TMP22]] +; SSE-NEXT: [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]]) +; SSE-NEXT: [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00 +; SSE-NEXT: [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]] +; SSE-NEXT: [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]]) +; SSE-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00 +; SSE-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] +; SSE: if.end: +; SSE-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]]) +; SSE-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] +; SSE-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] +; SSE-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 
+; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[MUL88]], i32 1 +; SSE-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] +; SSE-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1 +; SSE-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D +; SSE-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0 +; SSE-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D +; SSE-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false +; SSE-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] +; SSE: lor.lhs.false: +; SSE-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], +; SSE-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1 +; SSE-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]] +; SSE-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 +; SSE-NEXT: br label [[CLEANUP]] +; SSE: cleanup: +; SSE-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ] +; SSE-NEXT: ret i32 [[RETVAL_0]] +; +; AVX-LABEL: @ray_sphere( +; AVX-NEXT: entry: +; AVX-NEXT: [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1 +; AVX-NEXT: [[TMP0:%.*]] = load double, ptr [[DIR]], align 8 +; AVX-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1 +; AVX-NEXT: [[TMP1:%.*]] = load double, ptr [[Y]], align 8 +; AVX-NEXT: [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]] +; AVX-NEXT: [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]]) +; AVX-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2 +; AVX-NEXT: [[TMP3:%.*]] = load double, ptr [[Z]], align 8 +; AVX-NEXT: [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]]) +; 
AVX-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00 +; AVX-NEXT: [[TMP5:%.*]] = load double, ptr [[RAY]], align 8 +; AVX-NEXT: [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8 +; AVX-NEXT: [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]] +; AVX-NEXT: [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00 +; AVX-NEXT: [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1 +; AVX-NEXT: [[TMP7:%.*]] = load double, ptr [[Y19]], align 8 +; AVX-NEXT: [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1 +; AVX-NEXT: [[TMP8:%.*]] = load double, ptr [[Y21]], align 8 +; AVX-NEXT: [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]] +; AVX-NEXT: [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]] +; AVX-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]]) +; AVX-NEXT: [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00 +; AVX-NEXT: [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2 +; AVX-NEXT: [[TMP10:%.*]] = load double, ptr [[Z28]], align 8 +; AVX-NEXT: [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2 +; AVX-NEXT: [[TMP11:%.*]] = load double, ptr [[Z30]], align 8 +; AVX-NEXT: [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]] +; AVX-NEXT: [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]]) +; AVX-NEXT: [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]] +; AVX-NEXT: [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]]) +; AVX-NEXT: [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]]) +; AVX-NEXT: [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]]) +; AVX-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]]) +; AVX-NEXT: [[TMP17:%.*]] = tail call double 
@llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]]) +; AVX-NEXT: [[FNEG:%.*]] = fneg double [[TMP6]] +; AVX-NEXT: [[TMP18:%.*]] = fneg double [[TMP8]] +; AVX-NEXT: [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]] +; AVX-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]]) +; AVX-NEXT: [[NEG78:%.*]] = fneg double [[TMP11]] +; AVX-NEXT: [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]]) +; AVX-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]]) +; AVX-NEXT: [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1 +; AVX-NEXT: [[TMP22:%.*]] = load double, ptr [[RAD]], align 8 +; AVX-NEXT: [[NEG82:%.*]] = fneg double [[TMP22]] +; AVX-NEXT: [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]]) +; AVX-NEXT: [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00 +; AVX-NEXT: [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]] +; AVX-NEXT: [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]]) +; AVX-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00 +; AVX-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] +; AVX: if.end: +; AVX-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]]) +; AVX-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] +; AVX-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 +; AVX-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 +; AVX-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 +; AVX-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0 +; AVX-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 +; AVX-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] +; AVX-NEXT: 
[[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 +; AVX-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[MUL88]], i32 1 +; AVX-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] +; AVX-NEXT: [[TMP34:%.*]] = fcmp olt <2 x double> [[TMP33]], +; AVX-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP34]], i32 0 +; AVX-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP34]], i32 1 +; AVX-NEXT: [[OR_COND:%.*]] = select i1 [[TMP36]], i1 [[TMP35]], i1 false +; AVX-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] +; AVX: lor.lhs.false: +; AVX-NEXT: [[TMP37:%.*]] = fcmp ule <2 x double> [[TMP33]], +; AVX-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0 +; AVX-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1 +; AVX-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP39]], i1 true, i1 [[TMP38]] +; AVX-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 +; AVX-NEXT: br label [[CLEANUP]] +; AVX: cleanup: +; AVX-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ] +; AVX-NEXT: ret i32 [[RETVAL_0]] ; entry: %dir = getelementptr inbounds %struct.ray, ptr %ray, i64 0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll @@ -4,16 +4,12 @@ define i32 @crash_reordering_undefs() { ; CHECK-LABEL: @crash_reordering_undefs( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]] -; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537 ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef ; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537 ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, 
undef ; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537 -; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]] -; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537 +; CHECK-NEXT: [[ADD0:%.*]] = select i1 undef, i32 65536, i32 65537 +; CHECK-NEXT: [[ADD9:%.*]] = select i1 undef, i32 65536, i32 65537 ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) ; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP0]], undef ; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD0]], [[ADD2]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -11,36 +11,37 @@ ; CHECK-NEXT: [[TOBOOL_NOT19:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[C_022:%.*]] = phi i32* [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32*> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i32* [[C_022]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [ +; CHECK-NEXT: [[A_020:%.*]] = phi i32* [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32*> [ [[TMP15:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32*> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint i32* [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = 
trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> +; CHECK-NEXT: switch i32 [[TMP4]], label [[WHILE_BODY_BACKEDGE]] [ ; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint i32* [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1 -; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP9]], align 4 -; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32*> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint i32* [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 2 +; CHECK-NEXT: store i32 [[TMP8]], i32* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: -; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 +; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32*> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint i32* [[TMP10]] to i64 +; 
CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32*> [[TMP5]], i32 1 +; CHECK-NEXT: store i32 [[TMP12]], i32* [[TMP14]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: -; CHECK-NEXT: [[C_022_BE]] = phi i32* [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[A_020_BE]] = phi i32* [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR7]], [[SW_BB6]] ], [ [[INCDEC_PTR4]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP15]] = phi <2 x i32*> [ [[TMP5]], [[WHILE_BODY]] ], [ [[TMP13]], [[SW_BB6]] ], [ [[TMP9]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -96,17 +96,16 @@ ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> 
[[TMP8]], i32 1 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[TMP10]], [[TMP9]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: -; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 -; CHECK-NEXT: [[NOT_OR_COND9:%.*]] = or i1 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = fcmp ule <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: [[NOT_OR_COND9:%.*]] = or i1 [[TMP12]], [[TMP13]] ; CHECK-NEXT: ret i1 [[NOT_OR_COND9]] ; CHECK: cleanup: ; CHECK-NEXT: ret i1 false