diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1143,6 +1143,12 @@
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index = -1) const;
 
+  /// \return The cost of materializing the constant buildvector \p VecTy when
+  /// it is used as operand \p Idx of a user with opcode \p UserOpcode.
+  InstructionCost
+  getConstBuildVectorInstrCost(VectorType *VecTy, unsigned UserOpcode,
+                               unsigned Idx,
+                               TTI::TargetCostKind CostKind) const;
+
   /// \return The cost of replication shuffle of \p VF elements typed \p EltTy
   /// \p ReplicationFactor times.
   ///
@@ -1706,6 +1712,10 @@
   virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                              unsigned Index) = 0;
+  virtual InstructionCost
+  getConstBuildVectorInstrCost(VectorType *VecTy, unsigned UserOpcode,
+                               unsigned Idx,
+                               TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
@@ -2248,6 +2258,12 @@
     return Impl.getVectorInstrCost(Opcode, Val, Index);
   }
   InstructionCost
+  getConstBuildVectorInstrCost(VectorType *VecTy, unsigned UserOpcode,
+                               unsigned Idx,
+                               TTI::TargetCostKind CostKind) override {
+    return Impl.getConstBuildVectorInstrCost(VecTy, UserOpcode, Idx, CostKind);
+  }
+  InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
                             TTI::TargetCostKind CostKind) override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -566,6 +566,13 @@
     return 1;
   }
 
+  InstructionCost
+  getConstBuildVectorInstrCost(VectorType *VecTy, unsigned UserOpcode,
+                               unsigned Idx,
+                               TTI::TargetCostKind CostKind) const {
+    return 0;
+  }
+
   unsigned getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                      int VF, const APInt &DemandedDstElts,
                                      TTI::TargetCostKind CostKind) {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1143,6 +1143,13 @@
     return LT.first;
   }
 
+  InstructionCost getConstBuildVectorInstrCost(VectorType *VecTy,
+                                               unsigned UserOpcode,
+                                               unsigned Idx,
+                                               TTI::TargetCostKind CostKind) {
+    return 0;
+  }
+
   InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -852,6 +852,14 @@
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getConstBuildVectorInstrCost(
+    VectorType *VecTy, unsigned UserOpcode, unsigned Idx,
+    TTI::TargetCostKind CostKind) const {
+  InstructionCost Cost =
+      TTIImpl->getConstBuildVectorInstrCost(VecTy, UserOpcode, Idx, CostKind);
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getReplicationShuffleCost(
     Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
     TTI::TargetCostKind CostKind) {
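The hunks above are plumbing: a public `TargetTransformInfo` entry point, the matching pure-virtual hook in the internal Concept/Model pair, and conservative defaults that return 0. As a minimal sketch of how a client is expected to query the hook — the wrapper function and the chosen cost kind here are illustrative assumptions, not part of the patch:

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Hypothetical helper: what would materializing a splatted shift amount
// cost? Targets that can fold the splat into the shift encoding are
// expected to report TCC_Free (see the RISCV/X86 implementations below).
InstructionCost constShiftAmountCost(const TargetTransformInfo &TTI,
                                     VectorType *VecTy) {
  return TTI.getConstBuildVectorInstrCost(
      VecTy, Instruction::Shl, /*Idx=*/1,
      TargetTransformInfo::TCK_RecipThroughput);
}
```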
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -105,6 +105,11 @@
                                          Optional<FastMathFlags> FMF,
                                          TTI::TargetCostKind CostKind);
 
+  InstructionCost getConstBuildVectorInstrCost(VectorType *VecTy,
+                                               unsigned UserOpcode,
+                                               unsigned Idx,
+                                               TTI::TargetCostKind CostKind);
+
   bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
     if (!ST->hasVInstructions())
       return false;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -346,6 +346,33 @@
   return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
 }
 
+InstructionCost
+RISCVTTIImpl::getConstBuildVectorInstrCost(VectorType *VecTy,
+                                           unsigned UserOpcode, unsigned Idx,
+                                           TTI::TargetCostKind CostKind) {
+  InstructionCost VecCost = 0;
+  switch (UserOpcode) {
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TTI::TCC_Free;
+    LLVM_FALLTHROUGH;
+  default:
+    // Charge the load of the constant-pool address plus the cost of the
+    // vector load itself.
+    VecCost =
+        getMemoryOpCost(
+            Instruction::Load,
+            VecTy->getElementType()->getPointerTo(/*AddressSpace=*/0),
+            DL.getPointerABIAlignment(/*AddressSpace=*/0),
+            /*AddressSpace=*/0, CostKind) +
+        getMemoryOpCost(Instruction::Load, VecTy, DL.getABITypeAlign(VecTy),
+                        /*AddressSpace=*/0, CostKind);
+    break;
+  }
+  return VecCost;
+}
+
 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -148,6 +148,10 @@
                                   const Instruction *I = nullptr);
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
+  InstructionCost getConstBuildVectorInstrCost(VectorType *VecTy,
+                                               unsigned UserOpcode,
+                                               unsigned Idx,
+                                               TTI::TargetCostKind CostKind);
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
                                            bool Insert, bool Extract);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3775,6 +3775,27 @@
   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }
 
+InstructionCost
+X86TTIImpl::getConstBuildVectorInstrCost(VectorType *VecTy,
+                                         unsigned UserOpcode, unsigned Idx,
+                                         TTI::TargetCostKind CostKind) {
+  InstructionCost VecCost = 0;
+  switch (UserOpcode) {
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TTI::TCC_Free;
+    LLVM_FALLTHROUGH;
+  default:
+    VecCost =
+        getMemoryOpCost(Instruction::Load, VecTy, DL.getABITypeAlign(VecTy),
+                        /*AddressSpace=*/0, CostKind);
+    break;
+  }
+  return VecCost;
+}
+
 InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
                                                      const APInt &DemandedElts,
                                                      bool Insert,
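Both backend implementations encode the same two-case policy and differ only in what a constant-pool access costs. A self-contained model of that policy, with invented unit costs (the real numbers come from `getMemoryOpCost`), reads roughly:

```cpp
#include <cstdint>

enum class Opc { Shl, LShr, AShr, Other };

// Standalone sketch of the policy implemented above; the unit costs are
// placeholders for illustration, not values LLVM uses.
int64_t constBuildVectorCost(Opc UserOpcode, unsigned OperandIdx,
                             bool ChargeAddressLoad /* true on RISC-V */) {
  // A constant shift amount in operand 1 folds into the instruction
  // encoding, so building the splat is free.
  if ((UserOpcode == Opc::Shl || UserOpcode == Opc::LShr ||
       UserOpcode == Opc::AShr) &&
      OperandIdx == 1)
    return 0; // TTI::TCC_Free
  // Otherwise the constant is loaded from the constant pool: one vector
  // load, plus (on RISC-V) materializing the pool address first.
  return 1 + (ChargeAddressLoad ? 1 : 0);
}
```

X86 charges only the vector load; RISC-V additionally charges a pointer-typed load to model materializing the constant-pool address.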
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5761,8 +5761,55 @@
     }
   };
   if (E->State == TreeEntry::NeedToGather) {
-    if (allConstant(VL))
-      return 0;
+    if (allConstant(VL)) {
+      if (all_of(VL, [](Value *V) {
+            if (isa<UndefValue>(V))
+              return true;
+            auto *C = cast<Constant>(V);
+            return C->isZeroValue();
+          }))
+        return 0;
+      // Check if we already have the same buildvector.
+      if (any_of(VectorizableTree,
+                 [E, VL](const std::unique_ptr<TreeEntry> &TE) {
+                   return TE->State == TreeEntry::NeedToGather &&
+                          TE.get() != E && TE->Idx > E->Idx &&
+                          TE->isSame(VL);
+                 }))
+        return 0;
+      // TODO: improve opcode and idx for alternate opcodes.
+      unsigned UserOpcode =
+          (E->UserTreeIndices.empty() ||
+           E->UserTreeIndices.front().UserTE->isAltShuffle())
+              ? 0
+              : E->UserTreeIndices.front().UserTE->getOpcode();
+      // For reductions, use the reduction opcode as the user opcode.
+      if (E->UserTreeIndices.empty())
+        UserOpcode = cast<Instruction>(*UserIgnoreList->begin())->getOpcode();
+      unsigned UserIdx =
+          (E->UserTreeIndices.empty() ||
+           E->UserTreeIndices.front().UserTE->isAltShuffle())
+              ? 0
+              : E->UserTreeIndices.front().EdgeIdx;
+      InstructionCost VecCost = TTI->getConstBuildVectorInstrCost(
+          FinalVecTy, UserOpcode, UserIdx, CostKind);
+      InstructionCost ScalarCost = 0;
+      if (!ScalarTy->isIntegerTy()) {
+        ScalarCost += TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
+                                           DL->getABITypeAlign(ScalarTy),
+                                           /*AddressSpace=*/0, CostKind) *
+                      EntryVF;
+      } else {
+        for (Value *V : VL) {
+          if (isa<UndefValue>(V))
+            continue;
+          auto *CI = cast<ConstantInt>(V);
+          ScalarCost += TTI->getIntImmCostInst(
+              UserOpcode, UserIdx, CI->getValue(), ScalarTy, CostKind);
+        }
+      }
+      return VecCost - ScalarCost;
+    }
     if (isa<InsertElementInst>(VL[0]))
       return InstructionCost::getInvalid();
     SmallVector<int> Mask;
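The hunk above replaces the old blanket `return 0;` for all-constant gathers with a real trade-off: the target's cost to materialize the buildvector minus what the scalar code would have paid for the same immediates. A hedged sketch of that comparison, with placeholder numbers (the real inputs are `getConstBuildVectorInstrCost` and `getIntImmCostInst`):

```cpp
#include <cstdint>
#include <vector>

// Placeholder model of the cost returned for an all-constant gather node.
// A negative result means vectorizing saves the immediate-materialization
// work the scalar code had to do.
int64_t allConstantGatherCost(int64_t ConstBuildVectorCost,
                              const std::vector<int64_t> &PerImmScalarCost) {
  int64_t ScalarCost = 0;
  for (int64_t C : PerImmScalarCost) // getIntImmCostInst per element
    ScalarCost += C;
  return ConstBuildVectorCost - ScalarCost;
}
```

For a splat shift amount both terms are zero; for constants that are free as scalar immediates but need a constant-pool load as a vector, the node now carries a positive cost, which is what flips several of the test expectations below.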
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -288,17 +288,16 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i64 1
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i64 0
-; CHECK-NEXT:    [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[CMP4]], i1 false
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i64 1
+; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[TMP10]], i1 [[TMP9]], i1 false
 ; CHECK-NEXT:    br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
 ; CHECK:       lor.lhs.false:
-; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]],
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i64 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i64 1
-; CHECK-NEXT:    [[OR_COND1:%.*]] = select i1 [[TMP12]], i1 true, i1 [[TMP11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fcmp ule <2 x double> [[TMP7]],
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i64 1
+; CHECK-NEXT:    [[OR_COND1:%.*]] = select i1 [[TMP13]], i1 true, i1 [[TMP12]]
 ; CHECK-NEXT:    br label [[CLEANUP]]
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[OR_COND1]], [[LOR_LHS_FALSE]] ]
diff --git a/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll b/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll
--- a/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll
+++ b/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll
@@ -84,11 +84,20 @@
 ; TODO: This doesn't require 'nsz'. It should fold to ((x1 - x2) * 47.0)
 define float @test13_reassoc(float %X1, float %X2) {
-; CHECK-LABEL: @test13_reassoc(
-; CHECK-NEXT:    [[B:%.*]] = fmul reassoc float [[X1:%.*]], 4.700000e+01
-; CHECK-NEXT:    [[C:%.*]] = fmul reassoc float [[X2:%.*]], 4.700000e+01
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub reassoc float [[B]], [[C]]
-; CHECK-NEXT:    ret float [[TMP1]]
+; REASSOC_AND_IC-LABEL: @test13_reassoc(
+; REASSOC_AND_IC-NEXT:    [[B:%.*]] = fmul reassoc float [[X1:%.*]], 4.700000e+01
+; REASSOC_AND_IC-NEXT:    [[C:%.*]] = fmul reassoc float [[X2:%.*]], 4.700000e+01
+; REASSOC_AND_IC-NEXT:    [[TMP1:%.*]] = fsub reassoc float [[B]], [[C]]
+; REASSOC_AND_IC-NEXT:    ret float [[TMP1]]
+;
+; O2-LABEL: @test13_reassoc(
+; O2-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[X1:%.*]], i64 0
+; O2-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[X2:%.*]], i64 1
+; O2-NEXT:    [[TMP3:%.*]] = fmul reassoc <2 x float> [[TMP2]], <float 4.700000e+01, float 4.700000e+01>
+; O2-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32>
+; O2-NEXT:    [[TMP4:%.*]] = fsub reassoc <2 x float> [[TMP3]], [[SHIFT]]
+; O2-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i64 0
+; O2-NEXT:    ret float [[TMP5]]
 ;
   %B = fmul reassoc float %X1, 47.   ; X1*47
   %C = fmul reassoc float %X2, -47.  ; X2*-47
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -16,8 +16,8 @@
 define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: @gather_multiple_use(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i64 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i64 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]],
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -666,19 +666,19 @@
 define void @single_membound(double* %arg, double* %arg1, double %x) {
 ; CHECK-LABEL: @single_membound(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[ARG:%.*]], i64 1
-; CHECK-NEXT:    store double [[TMP]], double* [[TMP9]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[ARG1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01
+; CHECK-NEXT:    store double [[TMP]], double* [[TMP9]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = fsub double 1.000000e+00, [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 2
 ; CHECK-NEXT:    br label [[BB15:%.*]]
;
CHECK: bb15: -; CHECK-NEXT: [[TMP16:%.*]] = fmul double [[TMP]], 2.000000e+01 -; CHECK-NEXT: store double [[TMP16]], double* [[TMP9]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = fmul double [[TMP13]], 3.000000e+01 -; CHECK-NEXT: store double [[TMP17]], double* [[TMP14]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP9]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -1275,28 +1275,27 @@ ; CHECK-NEXT: [[T19:%.*]] = load float*, float** [[ARG:%.*]], align 8 ; CHECK-NEXT: [[T20:%.*]] = load float, float* [[ARG_3:%.*]], align 4 ; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds float, float* [[ARG_2:%.*]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> , float [[T20]], i32 1 ; CHECK-NEXT: br i1 [[C:%.*]], label [[BB22:%.*]], label [[BB30:%.*]] ; CHECK: bb22: ; CHECK-NEXT: [[T23:%.*]] = fmul float [[T20]], 9.900000e+01 -; CHECK-NEXT: [[T24:%.*]] = fmul float [[T23]], 9.900000e+01 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, float* [[T19]], i64 2 -; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01 -; CHECK-NEXT: store float [[T26]], float* [[T25]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[T23]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: store float [[TMP4]], float* [[T25]], align 4 ; CHECK-NEXT: [[T27:%.*]] = load float, float* [[T21]], align 8 -; CHECK-NEXT: [[T28:%.*]] = fadd float [[T24]], 2.000000e+01 -; CHECK-NEXT: [[T29:%.*]] = fadd float [[T26]], 2.000000e+01 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], ; CHECK-NEXT: br label [[BB30]] ; CHECK: bb30: -; CHECK-NEXT: [[T31:%.*]] = phi float [ [[T28]], [[BB22]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[T32:%.*]] = phi float [ [[T29]], [[BB22]] ], [ [[T20]], [[ENTRY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[BB36:%.*]] ; CHECK: bb36: -; CHECK-NEXT: [[T37:%.*]] = fmul float [[T31]], 3.000000e+00 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], ; CHECK-NEXT: [[T38:%.*]] = getelementptr inbounds float, float* [[ARG_3]], i64 0 -; CHECK-NEXT: store float [[T37]], float* [[T38]], align 4 -; CHECK-NEXT: [[T39:%.*]] = fmul float [[T32]], 3.000000e+00 -; CHECK-NEXT: [[T40:%.*]] = getelementptr inbounds float, float* [[ARG_3]], i64 1 -; CHECK-NEXT: store float [[T39]], float* [[T40]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[T38]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP7]], <2 x float>* [[TMP8]], align 4 ; CHECK-NEXT: br label [[BB41:%.*]] ; CHECK: bb41: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll --- a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll @@ -12,11 +12,13 @@ define void @foo(i64* nocapture writeonly %da) { ; CHECK-128-LABEL: @foo( ; CHECK-128-NEXT: entry: -; CHECK-128-NEXT: [[TMP0:%.*]] = bitcast i64* 
[[DA:%.*]] to <2 x i64>* -; CHECK-128-NEXT: store <2 x i64> , <2 x i64>* [[TMP0]], align 8 +; CHECK-128-NEXT: store i64 0, i64* [[DA:%.*]], align 8 +; CHECK-128-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 1 +; CHECK-128-NEXT: store i64 1, i64* [[ARRAYIDX1]], align 8 ; CHECK-128-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 2 -; CHECK-128-NEXT: [[TMP1:%.*]] = bitcast i64* [[ARRAYIDX2]] to <2 x i64>* -; CHECK-128-NEXT: store <2 x i64> , <2 x i64>* [[TMP1]], align 8 +; CHECK-128-NEXT: store i64 2, i64* [[ARRAYIDX2]], align 8 +; CHECK-128-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 3 +; CHECK-128-NEXT: store i64 3, i64* [[ARRAYIDX3]], align 8 ; CHECK-128-NEXT: ret void ; ; CHECK-256-LABEL: @foo( @@ -45,8 +47,9 @@ define void @foo8(i8* nocapture writeonly %da) { ; CHECK-LABEL: @foo8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[DA:%.*]] to <2 x i8>* -; CHECK-NEXT: store <2 x i8> , <2 x i8>* [[TMP0]], align 8 +; CHECK-NEXT: store i8 0, i8* [[DA:%.*]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DA]], i8 1 +; CHECK-NEXT: store i8 1, i8* [[ARRAYIDX1]], align 8 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[DA]], i8 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll @@ -7,20 +7,27 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3]] = extractelement <4 x i64> [[TMP2]], i32 3 +; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[LAST:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[FORK:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[INC1:%.*]] = add i64 [[TMP0]], 1 +; CHECK-NEXT: [[INC2:%.*]] = add i64 [[TMP0]], 2 +; CHECK-NEXT: [[INC11:%.*]] = add i64 1, [[INC1]] +; CHECK-NEXT: [[EXACT1:%.*]] = ashr exact i64 [[INC11]], 32 +; CHECK-NEXT: [[INC3:%.*]] = add i64 [[TMP0]], 3 ; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP4]], 32 -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> , [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP3]], 0 -; CHECK-NEXT: [[OP_RDX1]] = add i64 [[TMP7]], [[OP_RDX]] +; CHECK-NEXT: [[INC12:%.*]] = add i64 1, [[INC2]] +; CHECK-NEXT: [[EXACT2:%.*]] = ashr exact i64 [[INC12]], 32 +; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[INC3]], 32 +; CHECK-NEXT: [[INC13:%.*]] = add i64 1, [[INC3]] +; CHECK-NEXT: [[EXACT3:%.*]] = ashr exact i64 [[INC13]], 32 +; CHECK-NEXT: [[FORK]] = add i64 [[TMP0]], 0 +; CHECK-NEXT: [[SUM1:%.*]] = add i64 [[EXACT3]], [[EXACT2]] +; CHECK-NEXT: [[SUM2:%.*]] = add i64 [[SUM1]], [[EXACT1]] +; CHECK-NEXT: [[ZSUM:%.*]] = add i64 [[SUM2]], 0 +; 
CHECK-NEXT: [[SEXT22:%.*]] = add i64 1, [[FORK]] +; CHECK-NEXT: [[EXACT4:%.*]] = ashr exact i64 [[SEXT22]], 32 +; CHECK-NEXT: [[JOIN:%.*]] = add i64 [[FORK]], [[ZSUM]] +; CHECK-NEXT: [[LAST]] = add i64 [[JOIN]], [[EXACT4]] ; CHECK-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @fadd_fsub_v8f32( @@ -91,28 +91,9 @@ } define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { -; SSE-LABEL: @fmul_fdiv_v4f32_const( -; SSE-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; SSE-NEXT: ret <4 x float> [[TMP1]] -; -; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[A2]], i64 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i64 3 -; SLM-NEXT: ret <4 x float> [[R3]] -; -; AVX-LABEL: @fmul_fdiv_v4f32_const( -; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; AVX-NEXT: ret <4 x float> [[TMP1]] -; -; AVX512-LABEL: @fmul_fdiv_v4f32_const( -; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; AVX512-NEXT: ret <4 x float> [[TMP1]] +; CHECK-LABEL: @fmul_fdiv_v4f32_const( +; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x 
float> [[A:%.*]], +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @fadd_fsub_v8f32( @@ -91,28 +91,9 @@ } define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { -; SSE-LABEL: @fmul_fdiv_v4f32_const( -; SSE-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; SSE-NEXT: ret <4 x float> [[TMP1]] -; -; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[A2]], i64 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i64 3 -; SLM-NEXT: ret <4 x float> [[R3]] -; -; AVX-LABEL: @fmul_fdiv_v4f32_const( -; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; AVX-NEXT: ret <4 x float> [[TMP1]] -; -; AVX512-LABEL: @fmul_fdiv_v4f32_const( -; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; AVX512-NEXT: ret <4 x float> [[TMP1]] +; CHECK-LABEL: @fmul_fdiv_v4f32_const( +; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4 +; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE4 ; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX ; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512 @@ -15,39 +15,73 @@ ; } define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) { -; SSE-LABEL: @bitmask_16xi8( -; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 -; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 -; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 -; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> -; SSE-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 -; SSE-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer -; SSE-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> -; SSE-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; SSE-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 -; SSE-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 -; SSE-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 -; SSE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 -; SSE-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 -; SSE-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; SSE-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 -; SSE-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; SSE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) -; SSE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] -; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]] -; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]] -; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] -; SSE-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]] -; SSE-NEXT: ret i64 [[OP_RDX4]] +; SSE2-LABEL: @bitmask_16xi8( +; 
SSE2-NEXT: entry: +; SSE2-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 +; SSE2-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 +; SSE2-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 +; SSE2-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer +; SSE2-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> +; SSE2-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 +; SSE2-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; SSE2-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> +; SSE2-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 +; SSE2-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 +; SSE2-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 +; SSE2-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 +; SSE2-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 +; SSE2-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 +; SSE2-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 +; SSE2-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 +; SSE2-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 +; SSE2-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; SSE2-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 +; SSE2-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 +; SSE2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) +; SSE2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] +; SSE2-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]] +; SSE2-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]] +; SSE2-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] +; SSE2-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]] +; SSE2-NEXT: ret i64 [[OP_RDX4]] +; +; SSE4-LABEL: @bitmask_16xi8( +; SSE4-NEXT: entry: +; SSE4-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 +; SSE4-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 +; SSE4-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE4-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 +; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer +; SSE4-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> +; SSE4-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 +; SSE4-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 +; SSE4-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; SSE4-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> +; SSE4-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 +; SSE4-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 +; SSE4-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 +; SSE4-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 +; SSE4-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 +; SSE4-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 +; SSE4-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 
[[TMP8]], 0 +; SSE4-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 +; SSE4-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 +; SSE4-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; SSE4-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 +; SSE4-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 +; SSE4-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) +; SSE4-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) +; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] +; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]] +; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]] +; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] +; SSE4-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]] +; SSE4-NEXT: ret i64 [[OP_RDX4]] ; ; AVX-LABEL: @bitmask_16xi8( ; AVX-NEXT: entry: @@ -97,19 +131,21 @@ ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer ; AVX512-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> ; AVX512-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; AVX512-NEXT: [[TMP7:%.*]] = load <2 x i8>, ptr [[ARRAYIDX_13]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = icmp eq <2 x i8> [[TMP7]], zeroinitializer -; AVX512-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i64> zeroinitializer, <2 x i64> +; AVX512-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 +; AVX512-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 +; AVX512-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 +; AVX512-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 +; AVX512-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 +; AVX512-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 +; AVX512-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 ; AVX512-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; AVX512-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; AVX512-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP10]], 0 +; AVX512-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; AVX512-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 ; AVX512-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; AVX512-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) -; AVX512-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP11]], [[TMP12]] -; AVX512-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 -; AVX512-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 -; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP13]], [[TMP14]] +; AVX512-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) +; AVX512-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] +; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]] ; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]] ; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] ; AVX512-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]] @@ -198,33 +234,61 @@ } define i64 @bitmask_4xi16(ptr nocapture noundef readonly %src) { -; SSE-LABEL: @bitmask_4xi16( -; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2 -; SSE-NEXT: 
[[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0 -; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2 -; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 -; SSE-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 -; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 -; SSE-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 -; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 -; SSE-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 -; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] -; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] -; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] -; SSE-NEXT: ret i64 [[OP_RDX3]] +; SSE2-LABEL: @bitmask_4xi16( +; SSE2-NEXT: entry: +; SSE2-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2 +; SSE2-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0 +; SSE2-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2 +; SSE2-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +; SSE2-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> +; SSE2-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 +; SSE2-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 +; SSE2-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 +; SSE2-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; SSE2-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 +; SSE2-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 +; SSE2-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 +; SSE2-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; SSE2-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 +; SSE2-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; SSE2-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 +; SSE2-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 +; SSE2-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] +; SSE2-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] +; SSE2-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] +; SSE2-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] +; SSE2-NEXT: ret i64 [[OP_RDX3]] +; +; SSE4-LABEL: @bitmask_4xi16( +; SSE4-NEXT: entry: +; SSE4-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2 +; SSE4-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0 +; SSE4-NEXT: 
[[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE4-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1 +; SSE4-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2 +; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +; SSE4-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> +; SSE4-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 +; SSE4-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 +; SSE4-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 +; SSE4-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; SSE4-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 +; SSE4-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 +; SSE4-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 +; SSE4-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; SSE4-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 +; SSE4-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; SSE4-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 +; SSE4-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 +; SSE4-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] +; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] +; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] +; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] +; SSE4-NEXT: ret i64 [[OP_RDX3]] ; ; AVX-LABEL: @bitmask_4xi16( ; AVX-NEXT: entry: @@ -264,20 +328,22 @@ ; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer ; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 -; AVX512-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_5]], align 2 -; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <2 x i16> [[TMP4]], zeroinitializer -; AVX512-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 +; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 +; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 +; AVX512-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 +; AVX512-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 +; AVX512-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 ; AVX512-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 -; AVX512-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP7]], 0 +; AVX512-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 ; AVX512-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] +; AVX512-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] ; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] ; 
AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] +; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] ; AVX512-NEXT: ret i64 [[OP_RDX3]] ; entry: @@ -323,33 +389,61 @@ } define i64 @bitmask_8xi32(ptr nocapture noundef readonly %src) { -; SSE-LABEL: @bitmask_8xi32( -; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0 -; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4 -; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 -; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 -; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 -; SSE-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 -; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 -; SSE-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 -; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] -; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] -; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] -; SSE-NEXT: ret i64 [[OP_RDX3]] +; SSE2-LABEL: @bitmask_8xi32( +; SSE2-NEXT: entry: +; SSE2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 +; SSE2-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0 +; SSE2-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +; SSE2-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> +; SSE2-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 +; SSE2-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 +; SSE2-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 +; SSE2-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; SSE2-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 +; SSE2-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 +; SSE2-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 +; SSE2-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; SSE2-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 +; SSE2-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 +; SSE2-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 +; SSE2-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 +; SSE2-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE2-NEXT: 
[[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] +; SSE2-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] +; SSE2-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] +; SSE2-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] +; SSE2-NEXT: ret i64 [[OP_RDX3]] +; +; SSE4-LABEL: @bitmask_8xi32( +; SSE4-NEXT: entry: +; SSE4-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 +; SSE4-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0 +; SSE4-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE4-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1 +; SSE4-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +; SSE4-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> +; SSE4-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 +; SSE4-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 +; SSE4-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 +; SSE4-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; SSE4-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 +; SSE4-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 +; SSE4-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 +; SSE4-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; SSE4-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 +; SSE4-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 +; SSE4-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 +; SSE4-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 +; SSE4-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] +; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] +; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] +; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] +; SSE4-NEXT: ret i64 [[OP_RDX3]] ; ; AVX-LABEL: @bitmask_8xi32( ; AVX-NEXT: entry: @@ -389,20 +483,22 @@ ; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer ; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 -; AVX512-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_5]], align 4 -; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[TMP4]], zeroinitializer -; AVX512-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 +; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 +; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 +; AVX512-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 +; AVX512-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 +; AVX512-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 ; AVX512-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 -; AVX512-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 -; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP7]], 0 +; AVX512-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 +; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 ; AVX512-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX512-NEXT: [[TMP8:%.*]] = call i64 
@llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] +; AVX512-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] ; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] ; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] +; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] ; AVX512-NEXT: ret i64 [[OP_RDX3]] ; entry: @@ -556,20 +652,22 @@ ; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5 -; AVX512-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_5]], align 8 -; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <2 x i64> [[TMP4]], zeroinitializer -; AVX512-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX_5]], align 8 +; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0 +; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6 +; AVX512-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8 +; AVX512-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i64 [[TMP5]], 0 +; AVX512-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 ; AVX512-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7 -; AVX512-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 -; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP7]], 0 +; AVX512-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 +; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0 ; AVX512-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] +; AVX512-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] ; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] ; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] +; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] ; AVX512-NEXT: ret i64 [[OP_RDX3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -slp-vectorizer -S | FileCheck %s --check-prefix=SSE +; RUN: opt < %s 
-mtriple=x86_64-unknown -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s --check-prefix=AVX %struct.ray = type { %struct.vec3, %struct.vec3 } %struct.vec3 = type { double, double, double } @@ -9,86 +9,166 @@ %struct.material = type { %struct.vec3, double, double } define i32 @ray_sphere(ptr nocapture noundef readonly %sph, ptr nocapture noundef readonly byval(%struct.ray) align 8 %ray, ptr nocapture noundef readnone %sp) { -; CHECK-LABEL: @ray_sphere( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[DIR]], align 8 -; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[Y]], align 8 -; CHECK-NEXT: [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]]) -; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[Z]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]]) -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[RAY]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8 -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00 -; CHECK-NEXT: [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[Y19]], align 8 -; CHECK-NEXT: [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[Y21]], align 8 -; CHECK-NEXT: [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]] -; CHECK-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]]) -; CHECK-NEXT: [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00 -; CHECK-NEXT: [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr [[Z28]], align 8 -; CHECK-NEXT: [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = load double, ptr [[Z30]], align 8 -; CHECK-NEXT: [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]]) -; CHECK-NEXT: [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]]) -; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]]) -; CHECK-NEXT: [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]]) -; CHECK-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]]) -; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]]) -; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[TMP6]] -; CHECK-NEXT: 
[[TMP18:%.*]] = fneg double [[TMP8]] -; CHECK-NEXT: [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]]) -; CHECK-NEXT: [[NEG78:%.*]] = fneg double [[TMP11]] -; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]]) -; CHECK-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]]) -; CHECK-NEXT: [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1 -; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[RAD]], align 8 -; CHECK-NEXT: [[NEG82:%.*]] = fneg double [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]]) -; CHECK-NEXT: [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00 -; CHECK-NEXT: [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] -; CHECK: if.end: -; CHECK-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]]) -; CHECK-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] -; CHECK-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 -; CHECK-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[MUL88]], i32 1 -; CHECK-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1 -; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0 -; CHECK-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] -; CHECK: lor.lhs.false: -; CHECK-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1 -; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]] -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 -; CHECK-NEXT: br label [[CLEANUP]] -; CHECK: cleanup: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ] -; CHECK-NEXT: ret i32 [[RETVAL_0]] +; SSE-LABEL: @ray_sphere( +; SSE-NEXT: entry: +; SSE-NEXT: [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1 +; SSE-NEXT: [[TMP0:%.*]] = load double, ptr [[DIR]], align 8 +; SSE-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1 +; SSE-NEXT: [[TMP1:%.*]] = load double, ptr 
[[Y]], align 8 +; SSE-NEXT: [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]] +; SSE-NEXT: [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]]) +; SSE-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2 +; SSE-NEXT: [[TMP3:%.*]] = load double, ptr [[Z]], align 8 +; SSE-NEXT: [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]]) +; SSE-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00 +; SSE-NEXT: [[TMP5:%.*]] = load double, ptr [[RAY]], align 8 +; SSE-NEXT: [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8 +; SSE-NEXT: [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]] +; SSE-NEXT: [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00 +; SSE-NEXT: [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1 +; SSE-NEXT: [[TMP7:%.*]] = load double, ptr [[Y19]], align 8 +; SSE-NEXT: [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1 +; SSE-NEXT: [[TMP8:%.*]] = load double, ptr [[Y21]], align 8 +; SSE-NEXT: [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]] +; SSE-NEXT: [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]] +; SSE-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]]) +; SSE-NEXT: [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00 +; SSE-NEXT: [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2 +; SSE-NEXT: [[TMP10:%.*]] = load double, ptr [[Z28]], align 8 +; SSE-NEXT: [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2 +; SSE-NEXT: [[TMP11:%.*]] = load double, ptr [[Z30]], align 8 +; SSE-NEXT: [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]] +; SSE-NEXT: [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]]) +; SSE-NEXT: [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]] +; SSE-NEXT: [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]]) +; SSE-NEXT: [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]]) +; SSE-NEXT: [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]]) +; SSE-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]]) +; SSE-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]]) +; SSE-NEXT: [[FNEG:%.*]] = fneg double [[TMP6]] +; SSE-NEXT: [[TMP18:%.*]] = fneg double [[TMP8]] +; SSE-NEXT: [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]] +; SSE-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]]) +; SSE-NEXT: [[NEG78:%.*]] = fneg double [[TMP11]] +; SSE-NEXT: [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]]) +; SSE-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]]) +; SSE-NEXT: [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1 +; SSE-NEXT: [[TMP22:%.*]] = load double, ptr [[RAD]], align 8 +; SSE-NEXT: [[NEG82:%.*]] = fneg double [[TMP22]] +; SSE-NEXT: [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]]) +; SSE-NEXT: [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00 +; SSE-NEXT: [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]] +; SSE-NEXT: 
[[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]]) +; SSE-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00 +; SSE-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] +; SSE: if.end: +; SSE-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]]) +; SSE-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] +; SSE-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 +; SSE-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 +; SSE-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0 +; SSE-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 +; SSE-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] +; SSE-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 +; SSE-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[MUL88]], i32 1 +; SSE-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] +; SSE-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1 +; SSE-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D +; SSE-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0 +; SSE-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D +; SSE-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false +; SSE-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] +; SSE: lor.lhs.false: +; SSE-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], +; SSE-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0 +; SSE-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1 +; SSE-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]] +; SSE-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 +; SSE-NEXT: br label [[CLEANUP]] +; SSE: cleanup: +; SSE-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ] +; SSE-NEXT: ret i32 [[RETVAL_0]] +; +; AVX-LABEL: @ray_sphere( +; AVX-NEXT: entry: +; AVX-NEXT: [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1 +; AVX-NEXT: [[TMP0:%.*]] = load double, ptr [[DIR]], align 8 +; AVX-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1 +; AVX-NEXT: [[TMP1:%.*]] = load double, ptr [[Y]], align 8 +; AVX-NEXT: [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]] +; AVX-NEXT: [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]]) +; AVX-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2 +; AVX-NEXT: [[TMP3:%.*]] = load double, ptr [[Z]], align 8 +; AVX-NEXT: [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]]) +; AVX-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00 +; AVX-NEXT: [[TMP5:%.*]] = load double, ptr [[RAY]], align 8 +; AVX-NEXT: [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8 +; AVX-NEXT: [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]] +; AVX-NEXT: [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00 +; AVX-NEXT: [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1 +; AVX-NEXT: [[TMP7:%.*]] = load double, ptr [[Y19]], align 8 +; AVX-NEXT: [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1 +; AVX-NEXT: [[TMP8:%.*]] = 
load double, ptr [[Y21]], align 8 +; AVX-NEXT: [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]] +; AVX-NEXT: [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]] +; AVX-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]]) +; AVX-NEXT: [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00 +; AVX-NEXT: [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2 +; AVX-NEXT: [[TMP10:%.*]] = load double, ptr [[Z28]], align 8 +; AVX-NEXT: [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2 +; AVX-NEXT: [[TMP11:%.*]] = load double, ptr [[Z30]], align 8 +; AVX-NEXT: [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]] +; AVX-NEXT: [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]]) +; AVX-NEXT: [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]] +; AVX-NEXT: [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]]) +; AVX-NEXT: [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]]) +; AVX-NEXT: [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]]) +; AVX-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]]) +; AVX-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]]) +; AVX-NEXT: [[FNEG:%.*]] = fneg double [[TMP6]] +; AVX-NEXT: [[TMP18:%.*]] = fneg double [[TMP8]] +; AVX-NEXT: [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]] +; AVX-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]]) +; AVX-NEXT: [[NEG78:%.*]] = fneg double [[TMP11]] +; AVX-NEXT: [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]]) +; AVX-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]]) +; AVX-NEXT: [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1 +; AVX-NEXT: [[TMP22:%.*]] = load double, ptr [[RAD]], align 8 +; AVX-NEXT: [[NEG82:%.*]] = fneg double [[TMP22]] +; AVX-NEXT: [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]]) +; AVX-NEXT: [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00 +; AVX-NEXT: [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]] +; AVX-NEXT: [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]]) +; AVX-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00 +; AVX-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] +; AVX: if.end: +; AVX-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]]) +; AVX-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] +; AVX-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 +; AVX-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 +; AVX-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 +; AVX-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0 +; AVX-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 +; AVX-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] +; AVX-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 +; AVX-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[MUL88]], 
i32 1 +; AVX-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] +; AVX-NEXT: [[TMP34:%.*]] = fcmp olt <2 x double> [[TMP33]], +; AVX-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP34]], i32 0 +; AVX-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP34]], i32 1 +; AVX-NEXT: [[OR_COND:%.*]] = select i1 [[TMP36]], i1 [[TMP35]], i1 false +; AVX-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] +; AVX: lor.lhs.false: +; AVX-NEXT: [[TMP37:%.*]] = fcmp ule <2 x double> [[TMP33]], +; AVX-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0 +; AVX-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1 +; AVX-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP39]], i1 true, i1 [[TMP38]] +; AVX-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 +; AVX-NEXT: br label [[CLEANUP]] +; AVX: cleanup: +; AVX-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ] +; AVX-NEXT: ret i32 [[RETVAL_0]] ; entry: %dir = getelementptr inbounds %struct.ray, ptr %ray, i64 0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll @@ -14,18 +14,23 @@ ; CHECK-NEXT: ret void ; CHECK: if.else: ; CHECK-NEXT: [[M_NUMCONSTRAINTROWS4:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[NUB5:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO]], i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]] ; CHECK: land.lhs.true.i.1: ; CHECK-NEXT: br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]] ; CHECK: if.then7.1: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[INC_1:%.*]] = add nsw i32 0, 1 +; CHECK-NEXT: store i32 [[INC_1]], i32* [[M_NUMCONSTRAINTROWS4]], align 4 +; CHECK-NEXT: [[DEC_1:%.*]] = add nsw i32 6, -1 +; CHECK-NEXT: store i32 [[DEC_1]], i32* [[NUB5]], align 4 ; CHECK-NEXT: br label [[FOR_INC_1]] ; CHECK: for.inc.1: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ , [[IF_THEN7_1]] ], [ , [[LAND_LHS_TRUE_I_1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[DEC_1]], [[IF_THEN7_1]] ], [ 6, [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[INC_1]], [[IF_THEN7_1]] ], [ 0, [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[INC_2]], i32* [[M_NUMCONSTRAINTROWS4]], align 4 +; CHECK-NEXT: [[DEC_2:%.*]] = add nsw i32 [[TMP0]], -1 +; CHECK-NEXT: store i32 [[DEC_2]], i32* [[NUB5]], align 4 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll @@ -27,24 +27,25 @@ ; CHECK: land.rhs.lr.ph: ; 
CHECK-NEXT: unreachable ; CHECK: if.end98: +; CHECK-NEXT: [[FROM299:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE167]], label [[IF_THEN103:%.*]] ; CHECK: if.then103: -; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 ; CHECK-NEXT: [[DOTSUB100:%.*]] = select i1 undef, i32 250, i32 undef ; CHECK-NEXT: [[MUL114:%.*]] = shl nsw i32 [[DOTSUB100]], 2 +; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 ; CHECK-NEXT: [[COND125:%.*]] = select i1 undef, i32 undef, i32 [[MUL114]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[COND125]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[DOTSUB100]], i32 1 ; CHECK-NEXT: br label [[FOR_COND_I:%.*]] ; CHECK: for.cond.i: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ undef, [[LAND_RHS_I874:%.*]] ], [ [[TMP1]], [[IF_THEN103]] ] +; CHECK-NEXT: [[ROW_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874:%.*]] ], [ [[DOTSUB100]], [[IF_THEN103]] ] +; CHECK-NEXT: [[COL_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874]] ], [ [[COND125]], [[IF_THEN103]] ] ; CHECK-NEXT: br i1 undef, label [[LAND_RHS_I874]], label [[FOR_END_I:%.*]] ; CHECK: land.rhs.i874: ; CHECK-NEXT: br i1 undef, label [[FOR_COND_I]], label [[FOR_END_I]] ; CHECK: for.end.i: ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]] ; CHECK: if.then.i: -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], undef +; CHECK-NEXT: [[ADD14_I:%.*]] = add nsw i32 [[ROW_0_I]], undef +; CHECK-NEXT: [[ADD15_I:%.*]] = add nsw i32 [[COL_0_I]], undef ; CHECK-NEXT: br label [[EXTEND_BW_EXIT:%.*]] ; CHECK: if.end.i: ; CHECK-NEXT: [[ADD16_I:%.*]] = add i32 [[COND125]], [[DOTSUB100]] @@ -65,12 +66,14 @@ ; CHECK: while.end275.i: ; CHECK-NEXT: br label [[EXTEND_BW_EXIT]] ; CHECK: extend_bw.exit: -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ [[TMP3]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] +; CHECK-NEXT: [[ADD14_I1262:%.*]] = phi i32 [ [[ADD14_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] +; CHECK-NEXT: [[ADD15_I1261:%.*]] = phi i32 [ [[ADD15_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] ; CHECK-NEXT: br i1 false, label [[IF_THEN157:%.*]], label [[LAND_LHS_TRUE167]] ; CHECK: if.then157: -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[FROM1115]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> 
[[TMP5]], <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[ADD158:%.*]] = add nsw i32 [[ADD14_I1262]], 1 +; CHECK-NEXT: store i32 [[ADD158]], i32* [[FROM299]], align 4 +; CHECK-NEXT: [[ADD160:%.*]] = add nsw i32 [[ADD15_I1261]], 1 +; CHECK-NEXT: store i32 [[ADD160]], i32* [[FROM1115]], align 4 ; CHECK-NEXT: br label [[LAND_LHS_TRUE167]] ; CHECK: land.lhs.true167: ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -18,18 +18,13 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[MUL11]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP6]], <4 x double>* [[TMP7]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll @@ -7,17 +7,23 @@ define i32 @foo(i32* nocapture %A, i32 %n, i32 %m) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[EXTERNALUSE1:%.*]] = add nsw i32 [[TMP6]], [[M:%.*]] -; CHECK-NEXT: [[EXTERNALUSE2:%.*]] = mul nsw i32 [[TMP6]], [[M]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[N:%.*]], 5 +; CHECK-NEXT: [[ADD:%.*]] = add nsw 
i32 [[MUL]], 9 +; CHECK-NEXT: store i32 [[ADD]], i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[N]], 9 +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[MUL1]], 9 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 +; CHECK-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[MUL4:%.*]] = shl i32 [[N]], 3 +; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[MUL4]], 9 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; CHECK-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[MUL7:%.*]] = mul nsw i32 [[N]], 10 +; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[MUL7]], 9 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[EXTERNALUSE1:%.*]] = add nsw i32 [[ADD]], [[M:%.*]] +; CHECK-NEXT: [[EXTERNALUSE2:%.*]] = mul nsw i32 [[ADD]], [[M]] ; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[EXTERNALUSE1]], [[EXTERNALUSE2]] ; CHECK-NEXT: ret i32 [[ADD10]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -12,35 +12,37 @@ ; CHECK-NEXT: br i1 [[TOBOOL_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[C_022:%.*]] = phi i32* [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32*> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[B_021:%.*]] = phi i32* [ [[B_021_BE:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[A_020:%.*]] = phi i32* [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i32* [[C_022]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [ +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint i32* [[C_022]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[B_021]], i64 1 +; CHECK-NEXT: switch i32 [[TMP2]], label [[WHILE_BODY_BACKEDGE]] [ ; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint i32* [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1 -; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP9]], align 4 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[B_021]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint i32* [[INCDEC_PTR2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 2 +; CHECK-NEXT: store i32 [[TMP4]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 
2 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: +; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 2 ; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[INCDEC_PTR9:%.*]] = getelementptr inbounds i32, i32* [[B_021]], i64 2 +; CHECK-NEXT: store i32 [[TMP6]], i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: ; CHECK-NEXT: [[C_022_BE]] = phi i32* [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[B_021_BE]] = phi i32* [ [[INCDEC_PTR2]], [[WHILE_BODY]] ], [ [[INCDEC_PTR9]], [[SW_BB6]] ], [ [[INCDEC_PTR3]], [[SW_BB]] ] +; CHECK-NEXT: [[A_020_BE]] = phi i32* [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR7]], [[SW_BB6]] ], [ [[INCDEC_PTR4]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -447,26 +447,28 @@ ; ALL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ] ; ALL-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2 ; ALL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]] -; ALL-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; ALL-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1 -; ALL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] -; ALL-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4 -; ALL-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2 -; ALL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]] -; ALL-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4 -; ALL-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3 -; ALL-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]] -; ALL-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 +; ALL-NEXT: [[TMP1:%.*]] = or i64 [[TMP0]], 1 +; ALL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]] +; ALL-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 2 +; ALL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] +; ALL-NEXT: [[TMP3:%.*]] = or i64 [[TMP0]], 3 +; ALL-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]] +; ALL-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; ALL-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 +; ALL-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0 +; ALL-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1 +; ALL-NEXT: [[TMP8:%.*]] = extractelement <4 x 
float> [[TMP5]], i32 2 +; ALL-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3 ; ALL-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]] ; ALL: for.body16.lr.ph: ; ALL-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]] -; ALL-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4 +; ALL-NEXT: [[TMP10:%.*]] = load float, float* [[ADD_PTR]], align 4 ; ALL-NEXT: br label [[FOR_BODY16:%.*]] ; ALL: for.cond.cleanup15: -; ALL-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ] +; ALL-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP8]], [[FOR_BODY]] ], [ [[OP_RDX:%.*]], [[FOR_BODY16]] ] +; ALL-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP24:%.*]], [[FOR_BODY16]] ] +; ALL-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP12:%.*]], [[FOR_BODY16]] ] +; ALL-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP6]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ] ; ALL-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4 ; ALL-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4 ; ALL-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4 @@ -475,26 +477,27 @@ ; ALL-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6 ; ALL-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; ALL: for.body16: -; ALL-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ] ; ALL-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ] -; ALL-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000 -; ALL-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000 -; ALL-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]] -; ALL-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP23:%.*]], [[FOR_BODY16]] ] +; ALL-NEXT: [[TMP12]] = extractelement <4 x float> [[TMP11]], i32 0 +; ALL-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1 +; ALL-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0 +; ALL-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i32 1 +; ALL-NEXT: [[TMP16:%.*]] = fmul fast <2 x float> [[TMP15]], +; ALL-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 +; ALL-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 +; ALL-NEXT: [[SUB92:%.*]] = fadd fast float [[TMP17]], [[TMP18]] +; ALL-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP10]] ; ALL-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000 -; ALL-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000 -; ALL-NEXT: [[MUL23:%.*]] = fmul fast float 
[[W1_099]], 0x4002666660000000 -; ALL-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000 -; ALL-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000 -; ALL-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]] -; ALL-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]] -; ALL-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]] -; ALL-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]] +; ALL-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> [[TMP11]], +; ALL-NEXT: [[TMP20:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP19]]) +; ALL-NEXT: [[OP_RDX]] = fadd fast float [[TMP20]], [[MUL20]] ; ALL-NEXT: [[INC]] = add nuw i32 [[J_098]], 1 ; ALL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]] +; ALL-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[SUB19]], i32 0 +; ALL-NEXT: [[TMP22:%.*]] = shufflevector <4 x float> [[TMP21]], <4 x float> [[TMP11]], <4 x i32> +; ALL-NEXT: [[TMP23]] = insertelement <4 x float> [[TMP22]], float [[OP_RDX]], i32 2 +; ALL-NEXT: [[TMP24]] = extractelement <4 x float> [[TMP11]], i32 2 ; ALL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -482,37 +482,23 @@ define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { -; SSE-LABEL: @ExtractIdxNotConstantInt1( -; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef -; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] -; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] -; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; SSE-NEXT: ret i1 [[CMP_I185]] -; -; AVX-LABEL: @ExtractIdxNotConstantInt1( -; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef -; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]] -; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]] -; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] -; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01 -; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]] -; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01 -; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, 
[[ADD78_I180]] -; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]] -; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; AVX-NEXT: ret i1 [[CMP_I185]] +; CHECK-LABEL: @ExtractIdxNotConstantInt1( +; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef +; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] +; CHECK-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP_I185]] ; %vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef %sub14.i167 = fsub float undef, %vecext.i291.i166 @@ -530,37 +516,23 @@ define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { -; SSE-LABEL: @ExtractIdxNotConstantInt2( -; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1 -; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] -; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] -; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; SSE-NEXT: ret i1 [[CMP_I185]] -; -; AVX-LABEL: @ExtractIdxNotConstantInt2( -; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1 -; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]] -; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]] -; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] -; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01 -; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]] -; AVX-NEXT: [[ADD78_I180:%.*]] = 
fsub float [[MUL72_I179]], 3.000000e+01 -; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]] -; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]] -; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; AVX-NEXT: ret i1 [[CMP_I185]] +; CHECK-LABEL: @ExtractIdxNotConstantInt2( +; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1 +; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] +; CHECK-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP_I185]] ; %vecext.i291.i166 = extractelement <4 x float> %vec, i64 1 %sub14.i167 = fsub float undef, %vecext.i291.i166 @@ -578,37 +550,23 @@ define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { -; SSE-LABEL: @foo( -; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 -; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] -; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] -; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; SSE-NEXT: ret i1 [[CMP_I185]] -; -; AVX-LABEL: @foo( -; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 -; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]] -; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]] -; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 -; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01 -; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]] -; AVX-NEXT: [[ADD78_I180:%.*]] 
= fsub float [[MUL72_I179]], 3.000000e+01 -; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]] -; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]] -; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; AVX-NEXT: ret i1 [[CMP_I185]] +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 +; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] +; CHECK-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP_I185]] ; %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0 %sub14.i167 = fsub float undef, %vecext.i291.i166 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=AVX target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -13,21 +13,34 @@ ; zero-extend the roots back to their original sizes. 
; define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) { -; CHECK-LABEL: @PR31243_zext( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1 -; CHECK-NEXT: [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1 -; CHECK-NEXT: [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]] -; CHECK-NEXT: ret i8 [[TMP_8]] +; SSE-LABEL: @PR31243_zext( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 +; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 +; SSE-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i64 +; SSE-NEXT: [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]] +; SSE-NEXT: [[TMP3:%.*]] = zext i8 [[TMP1]] to i64 +; SSE-NEXT: [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]] +; SSE-NEXT: [[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1 +; SSE-NEXT: [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1 +; SSE-NEXT: [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]] +; SSE-NEXT: ret i8 [[TMP_8]] +; +; AVX-LABEL: @PR31243_zext( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 +; AVX-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], +; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 +; AVX-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 +; AVX-NEXT: [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]] +; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 +; AVX-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 +; AVX-NEXT: [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]] +; AVX-NEXT: [[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1 +; AVX-NEXT: [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1 +; AVX-NEXT: [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]] +; AVX-NEXT: ret i8 [[TMP_8]] ; entry: %tmp_0 = zext i8 %v0 to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -67,25 +67,32 @@ ; SSE-LABEL: @pr35497( ; SSE-NEXT: entry: ; SSE-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 +; SSE-NEXT: [[AND:%.*]] = shl i64 [[TMP0]], 2 +; SSE-NEXT: [[SHL:%.*]] = and i64 [[AND]], 20 ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], i64* undef, align 1 +; SSE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 +; SSE-NEXT: [[AND_1:%.*]] = shl i64 undef, 2 +; SSE-NEXT: [[SHL_1:%.*]] = and i64 [[AND_1]], 20 +; SSE-NEXT: [[SHR_1:%.*]] = lshr i64 undef, 6 +; SSE-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]] ; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 +; SSE-NEXT: [[SHR_2:%.*]] = lshr i64 undef, 6 +; SSE-NEXT: [[ADD_2:%.*]] = add nuw nsw 
i64 [[SHL_1]], [[SHR_2]] +; SSE-NEXT: [[AND_4:%.*]] = shl i64 [[ADD]], 2 +; SSE-NEXT: [[SHL_4:%.*]] = and i64 [[AND_4]], 20 +; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 +; SSE-NEXT: store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1 +; SSE-NEXT: [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2 +; SSE-NEXT: [[SHL_5:%.*]] = and i64 [[AND_5]], 20 +; SSE-NEXT: [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6 +; SSE-NEXT: [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]] +; SSE-NEXT: store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1 ; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 -; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], -; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1 -; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], -; SSE-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], -; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], -; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] -; SSE-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* -; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 +; SSE-NEXT: store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1 +; SSE-NEXT: [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6 +; SSE-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]] +; SSE-NEXT: store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -2,8 +2,8 @@ ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX2 define void @store_i32(i32* nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: @store_i32( @@ -143,19 +143,58 @@ ; ; AVX-LABEL: @store_i64( ; AVX-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa 
[[TBAA5:![0-9]+]] -; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer -; AVX-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP6]], [[SHUFFLE]] -; AVX-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15> -; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> -; AVX-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255> -; AVX-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> -; AVX-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> <i64 255, i64 255, i64 255, i64 255> -; AVX-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* -; AVX-NEXT: store <4 x i64> [[TMP13]], <4 x i64>* [[TMP14]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 +; AVX-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; AVX-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 +; AVX-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 +; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 +; AVX-NEXT: store i64 [[TMP11]], i64* [[TMP0]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] +; AVX-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 +; AVX-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 +; AVX-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 +; AVX-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 +; AVX-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 +; AVX-NEXT: store i64 [[TMP19]], i64* [[TMP12]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 +; AVX-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP20]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] +; AVX-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 +; AVX-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; AVX-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 +; AVX-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 +; AVX-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 +; AVX-NEXT: store i64 [[TMP27]], i64* [[TMP20]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 3 +; AVX-NEXT: [[TMP29:%.*]] = load i64, i64* [[TMP28]], align 8, !tbaa [[TBAA5]] +; AVX-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] +; AVX-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 +; AVX-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 +; AVX-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 +; AVX-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 +; AVX-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 +; AVX-NEXT: store i64 [[TMP35]], i64* [[TMP28]], align 8, !tbaa [[TBAA5]] ; AVX-NEXT: ret void +; +; AVX2-LABEL: @store_i64( +; AVX2-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 +; AVX2-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP0:%.*]] to <4 x i64>* +; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8, !tbaa [[TBAA5:![0-9]+]] +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 +; AVX2-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer +; AVX2-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP6]], [[SHUFFLE]]
+; AVX2-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15> +; AVX2-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255> +; AVX2-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> +; AVX2-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> <i64 255, i64 255, i64 255, i64 255> +; AVX2-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP0]] to <4 x i64>* +; AVX2-NEXT: store <4 x i64> [[TMP13]], <4 x i64>* [[TMP14]], align 8, !tbaa [[TBAA5]] +; AVX2-NEXT: ret void ; %4 = zext i32 %1 to i64 %5 = load i64, i64* %0, align 8, !tbaa !7 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -199,29 +199,33 @@ ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 ; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 -; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 -; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], -; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* -; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], 
i32 [[TMP9]], i64 2 +; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], +; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0 +; AVX-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1 +; AVX-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2 +; AVX-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 +; AVX-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], +; AVX-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( @@ -402,6 +406,7 @@ ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 ; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 @@ -410,21 +415,24 @@ ; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 +; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], +; AVX-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; 
AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 -; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], -; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* -; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2 +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3 +; AVX-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], +; AVX-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -199,29 +199,33 @@ ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 ; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 -; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 -; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], -; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* -; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP3]], 
align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 +; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], +; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0 +; AVX-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1 +; AVX-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2 +; AVX-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3 +; AVX-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]], +; AVX-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( @@ -402,6 +406,7 @@ ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4 ; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 ; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 ; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 @@ -410,21 +415,24 @@ ; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3 +; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], +; AVX-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa 
[[TBAA0]] -; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 -; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], -; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* -; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1 +; AVX-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2 +; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3 +; AVX-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], +; AVX-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>* +; AVX-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -227,15 +227,37 @@ ; logic...or a wide reduction? define i1 @logical_and_icmp_clamp(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42> -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> -; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; SSE-LABEL: @logical_and_icmp_clamp( +; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42> +; SSE-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> +; SSE-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]]) +; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP1]] +; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX]] +; +; AVX-LABEL: @logical_and_icmp_clamp( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: 
[[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -260,17 +282,40 @@ } define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: call void @use1(i1 [[TMP2]]) -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> -; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 [[TMP7]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; SSE-LABEL: @logical_and_icmp_clamp_extra_use_cmp( +; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42> +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; SSE-NEXT: call void @use1(i1 [[TMP2]]) +; SSE-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> +; SSE-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]]) +; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]] +; SSE-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 [[TMP7]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX]] +; +; AVX-LABEL: @logical_and_icmp_clamp_extra_use_cmp( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: call void @use1(i1 [[C2]]) +; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -296,21 +341,44 @@ } define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_select( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42> -; CHECK-NEXT: 
[[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false -; CHECK-NEXT: call void @use1(i1 [[S2]]) -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 [[S2]], i1 false -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP7]], i1 [[OP_RDX]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX1]] +; SSE-LABEL: @logical_and_icmp_clamp_extra_use_select( +; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42> +; SSE-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; SSE-NEXT: [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false +; SSE-NEXT: call void @use1(i1 [[S2]]) +; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]] +; SSE-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) +; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 [[S2]], i1 false +; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP7]], i1 [[OP_RDX]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX1]] +; +; AVX-LABEL: @logical_and_icmp_clamp_extra_use_select( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: call void @use1(i1 [[S2]]) +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -386,38 +454,20 @@ } define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { -; SSE-LABEL: @logical_and_icmp_clamp_partial( -; SSE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 -; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 
[[TMP3]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[TMP5]], <i32 42, i32 42> -; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42 -; SSE-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> -; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]]) -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 -; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 -; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false -; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false -; SSE-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 [[OP_RDX1]], i1 false -; SSE-NEXT: ret i1 [[OP_RDX2]] -; -; AVX-LABEL: @logical_and_icmp_clamp_partial( -; AVX-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 -; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0 -; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP3]], 42 -; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP2]], 42 -; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> -; AVX-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) -; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false -; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false -; AVX-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP6]], i1 [[OP_RDX1]], i1 false -; AVX-NEXT: ret i1 [[OP_RDX2]] +; CHECK-LABEL: @logical_and_icmp_clamp_partial( +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0 +; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP3]], 42 +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP2]], 42 +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42 +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false +; CHECK-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP6]], i1 [[OP_RDX1]], i1 false +; CHECK-NEXT: ret i1 [[OP_RDX2]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 @@ -442,17 +492,39 @@ } define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42> -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7> -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> -; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; SSE-LABEL: @logical_and_icmp_clamp_pred_diff( +; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42> +; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42> +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7> 
+; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17> +; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) +; SSE-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]] +; SSE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]]) +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX]] +; +; AVX-LABEL: @logical_and_icmp_clamp_pred_diff( +; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 +; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1 +; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 +; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3 +; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42 +; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42 +; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42 +; AVX-NEXT: [[C3:%.*]] = icmp ult i32 [[X3]], 42 +; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17 +; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17 +; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17 +; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17 +; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false +; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false +; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false +; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false +; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false +; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false +; AVX-NEXT: ret i1 [[S7]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -96,17 +96,16 @@ ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[TMP10]], [[TMP9]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: -; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 -; CHECK-NEXT: [[NOT_OR_COND9:%.*]] = or i1 [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = fcmp ule <2 x double> [[TMP7]], +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: [[NOT_OR_COND9:%.*]] = or i1 [[TMP12]], [[TMP13]] ; CHECK-NEXT: ret i1 [[NOT_OR_COND9]] ; CHECK: cleanup: ; CHECK-NEXT: ret i1 false
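The reduction-logical.ll hunks above all follow the same shape: under the SSE prefixes the icmp-plus-select chains still collapse into <4 x i1> freeze/reduce.and calls, while the AVX prefixes now expect the original scalar select chain. A minimal standalone sketch of that input pattern follows; the function name @clamp_sketch and the exact opt invocation are illustrative only, not part of the patch, e.g. opt -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx clamp_sketch.ll:

; Hypothetical reproducer of the logical-and icmp clamp pattern
; exercised by the checks above (not part of the patch).
define i1 @clamp_sketch(<4 x i32> %x) {
  %x0 = extractelement <4 x i32> %x, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
  %x2 = extractelement <4 x i32> %x, i32 2
  %x3 = extractelement <4 x i32> %x, i32 3
  %c0 = icmp slt i32 %x0, 42
  %c1 = icmp slt i32 %x1, 42
  %c2 = icmp slt i32 %x2, 42
  %c3 = icmp slt i32 %x3, 42
  ; chain of logical-and selects, as in the tests above
  %s1 = select i1 %c0, i1 %c1, i1 false
  %s2 = select i1 %s1, i1 %c2, i1 false
  %s3 = select i1 %s2, i1 %c3, i1 false
  ret i1 %s3
}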
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -189,13 +189,14 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 ; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3> -; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 +; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 +; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 +; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -222,21 +223,22 @@ define void @addsub1(i32* noalias %dst, i32* noalias %src) { ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1> -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 +; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 +; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP6]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 
-; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP7]], -3 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 ; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ;
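The vect_copyable_in_binops.ll input functions are strided copies in which alternating lanes are an add or a sub by a small negative constant; with this patch the updated checks expect the former <2 x i32> add/sub-plus-shufflevector sequences to stay scalar. A minimal sketch of that two-lane scalar shape follows; the function name @addsub_sketch is hypothetical (typed pointers are used to match the tests above):

; Hypothetical two-lane addsub input (not part of the patch):
; lane 0 is add -1, lane 1 is sub -1, as in @addsub1 above.
define void @addsub_sketch(i32* noalias %dst, i32* noalias %src) {
entry:
  %src1 = getelementptr inbounds i32, i32* %src, i64 1
  %dst1 = getelementptr inbounds i32, i32* %dst, i64 1
  %0 = load i32, i32* %src, align 4
  %add = add nsw i32 %0, -1
  store i32 %add, i32* %dst, align 4
  %1 = load i32, i32* %src1, align 4
  %sub = sub nsw i32 %1, -1
  store i32 %sub, i32* %dst1, align 4
  ret void
}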