diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1143,6 +1143,12 @@
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index = -1) const;

+  /// \return The cost of the constant buildvector sequence.
+  InstructionCost
+  getConstBuildVectorInstrCost(VectorType *VecTy, unsigned UserOpcode,
+                               unsigned Idx,
+                               TTI::TargetCostKind CostKind) const;
+
   /// \return The cost of replication shuffle of \p VF elements typed \p EltTy
   /// \p ReplicationFactor times.
   ///
@@ -1706,6 +1712,10 @@
   virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                              unsigned Index) = 0;
+  virtual InstructionCost
+  getConstBuildVectorInstrCost(VectorType *VecTy, unsigned UserOpcode,
+                               unsigned Idx, TTI::TargetCostKind CostKind) = 0;
+
   virtual InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
@@ -2248,6 +2258,12 @@
     return Impl.getVectorInstrCost(Opcode, Val, Index);
   }
   InstructionCost
+  getConstBuildVectorInstrCost(VectorType *VecTy, unsigned UserOpcode,
+                               unsigned Idx,
+                               TTI::TargetCostKind CostKind) override {
+    return Impl.getConstBuildVectorInstrCost(VecTy, UserOpcode, Idx, CostKind);
+  }
+  InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
                             TTI::TargetCostKind CostKind) override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -566,6 +566,13 @@
     return 1;
   }

+  InstructionCost
+  getConstBuildVectorInstrCost(VectorType *VecTy, unsigned UserOpcode,
+                               unsigned Idx,
+                               TTI::TargetCostKind CostKind) const {
+    return TTI::TCC_Free;
+  }
+
   unsigned getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                                      const APInt &DemandedDstElts,
                                      TTI::TargetCostKind CostKind) {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1143,6 +1143,13 @@
     return LT.first;
   }

+  InstructionCost getConstBuildVectorInstrCost(VectorType *VecTy,
+                                               unsigned UserOpcode,
+                                               unsigned Idx,
+                                               TTI::TargetCostKind CostKind) {
+    return TTI::TCC_Free;
+  }
+
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,
                                             const APInt &DemandedDstElts,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -852,6 +852,14 @@
   return Cost;
 }

+InstructionCost TargetTransformInfo::getConstBuildVectorInstrCost(
+    VectorType *VecTy, unsigned UserOpcode, unsigned Idx,
+    TTI::TargetCostKind CostKind) const {
+  InstructionCost Cost =
+      TTIImpl->getConstBuildVectorInstrCost(VecTy, UserOpcode, Idx, CostKind);
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getReplicationShuffleCost(
     Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
     TTI::TargetCostKind CostKind) {
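For context before the per-target implementations below: a minimal sketch of how a cost-model client would query the new hook. The caller is hypothetical and not part of this patch; constOperandCost, Op, and OpIdx are illustrative names, and the cost kind is pinned to TCK_RecipThroughput because that is the only kind the target implementations in this patch model so far.

// Hypothetical caller, illustration only. Asks: what does it cost to
// materialize the constant vector operand OpIdx of Op, given that the
// target may be able to fold it into Op's instruction for free?
static llvm::InstructionCost
constOperandCost(const llvm::TargetTransformInfo &TTI, llvm::Instruction &Op,
                 unsigned OpIdx) {
  auto *VecTy = llvm::cast<llvm::VectorType>(Op.getOperand(OpIdx)->getType());
  return TTI.getConstBuildVectorInstrCost(
      VecTy, Op.getOpcode(), OpIdx,
      llvm::TargetTransformInfo::TCK_RecipThroughput);
}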
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -112,6 +112,10 @@
   if (BitSize == 0)
     return TTI::TCC_Free;

+  // TODO: implement for throughput cost.
+  if (CostKind == TTI::TCK_RecipThroughput)
+    return TTI::TCC_Free;
+
   unsigned ImmIdx = ~0U;
   switch (Opcode) {
   default:
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -386,6 +386,10 @@
                                           const APInt &Imm, Type *Ty,
                                           TTI::TargetCostKind CostKind,
                                           Instruction *Inst) {
+  // TODO: implement for throughput cost.
+  if (CostKind == TTI::TCK_RecipThroughput)
+    return TTI::TCC_Free;
+
   // Division by a constant can be turned into multiplication, but only if we
   // know it's constant. So it's not so much that the immediate is cheap (it's
   // not), but that the alternative is worse.
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -169,6 +169,10 @@
 InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                           TTI::TargetCostKind CostKind) {
+  // TODO: implement for throughput cost.
+  if (CostKind == TTI::TCK_RecipThroughput)
+    return TTI::TCC_Free;
+
   if (DisablePPCConstHoist)
     return BaseT::getIntImmCost(Imm, Ty, CostKind);

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -105,6 +105,11 @@
                                              Optional<FastMathFlags> FMF,
                                              TTI::TargetCostKind CostKind);

+  InstructionCost getConstBuildVectorInstrCost(VectorType *VecTy,
+                                               unsigned UserOpcode,
+                                               unsigned Idx,
+                                               TTI::TargetCostKind CostKind);
+
   bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
     if (!ST->hasVInstructions())
       return false;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -355,6 +355,32 @@
   return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
 }

+InstructionCost
+RISCVTTIImpl::getConstBuildVectorInstrCost(VectorType *VecTy,
+                                           unsigned UserOpcode, unsigned Idx,
+                                           TTI::TargetCostKind CostKind) {
+  InstructionCost VecCost = 0;
+  switch (UserOpcode) {
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TTI::TCC_Free;
+    LLVM_FALLTHROUGH;
+  default: {
+    APInt PseudoAddr = APInt::getAllOnes(DL.getPointerSizeInBits());
+    // Add the cost of materializing the address + the cost of the vector load.
+    VecCost =
+        RISCVMatInt::getIntMatCost(PseudoAddr, DL.getPointerSizeInBits(),
+                                   getST()->getFeatureBits()) +
+        getMemoryOpCost(Instruction::Load, VecTy, DL.getABITypeAlign(VecTy),
+                        /*AddressSpace=*/0, CostKind);
+    break;
+  }
+  }
+  return VecCost;
+}
+
 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
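To make the RISCV model above concrete: a constant buildvector is assumed to be materialized from the constant pool, so its throughput cost is approximated as the cost of materializing the pool address (an all-ones value stands in for the unknown address as a worst case) plus one vector load, while a constant shift amount is free because it can fold into the shift instruction. A sketch of what a caller would observe, assuming an in-scope LLVMContext Ctx and a TargetTransformInfo TTI for a vector-enabled RISCV subtarget (names assumed for illustration):

// Constant shift amount (operand 1 of a shift): costed as free.
auto *V4I32 = llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
llvm::InstructionCost ShiftAmtCost = TTI.getConstBuildVectorInstrCost(
    V4I32, llvm::Instruction::Shl, /*Idx=*/1,
    llvm::TargetTransformInfo::TCK_RecipThroughput);
// The same constant feeding an add: priced as the address materialization
// plus a <4 x i32> load from the constant pool.
llvm::InstructionCost AddOpCost = TTI.getConstBuildVectorInstrCost(
    V4I32, llvm::Instruction::Add, /*Idx=*/1,
    llvm::TargetTransformInfo::TCK_RecipThroughput);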
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -110,6 +110,11 @@
   // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
     return TTI::TCC_Free;
+
+  // TODO: implement for throughput cost.
+  if (CostKind == TTI::TCK_RecipThroughput)
+    return TTI::TCC_Free;
+
   // No cost model for operations on integers larger than 64 bit implemented yet.
   if (BitSize > 64)
     return TTI::TCC_Free;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -148,6 +148,10 @@
                                   const Instruction *I = nullptr);
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
+  InstructionCost getConstBuildVectorInstrCost(VectorType *VecTy,
+                                               unsigned UserOpcode,
+                                               unsigned Idx,
+                                               TTI::TargetCostKind CostKind);
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
                                            bool Insert, bool Extract);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3775,6 +3775,45 @@
   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }

+InstructionCost
+X86TTIImpl::getConstBuildVectorInstrCost(VectorType *VecTy, unsigned UserOpcode,
+                                         unsigned Idx,
+                                         TTI::TargetCostKind CostKind) {
+  assert(CostKind == TTI::TCK_RecipThroughput &&
+         "Expected only TTI::TCK_RecipThroughput currently.");
+  Type *ScalarTy = VecTy->getElementType();
+  TypeSize Sz = DL.getTypeSizeInBits(ScalarTy);
+  if (CostKind == TTI::TCK_RecipThroughput &&
+      (Sz > 64 || (Sz > 32 && ST->is32Bit()) || (Sz > 16 && ST->is16Bit())))
+    return X86TTIImpl::getMemoryOpCost(Instruction::Load, VecTy,
+                                       DL.getABITypeAlign(VecTy),
+                                       /*AddressSpace=*/0, CostKind);
+
+  // Even if the constant value is read from memory, it is free for many
+  // vector instructions, since they have forms that can take the second
+  // operand directly from memory.
+  switch (UserOpcode) {
+  case Instruction::Mul:
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::FAdd:
+  case Instruction::FSub:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FCmp:
+    if (Idx == 1)
+      return TTI::TCC_Free;
+    break;
+  default:
+    break;
+  }
+  return getMemoryOpCost(Instruction::Load, VecTy, DL.getABITypeAlign(VecTy),
+                         /*AddressSpace=*/0, CostKind);
+}
+
 InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
                                                      const APInt &DemandedElts,
                                                      bool Insert,
@@ -4876,6 +4915,14 @@
   if (BitSize == 0)
     return TTI::TCC_Free;

+  uint64_t Val = Imm.getLimitedValue();
+  if (CostKind == TTI::TCK_RecipThroughput &&
+      (Imm.getActiveBits() > 64 || (!isInt<32>(Val) && ST->is32Bit()) ||
+       (!isInt<16>(Val) && ST->is16Bit())))
+    return X86TTIImpl::getMemoryOpCost(Instruction::Load, Ty,
+                                       DL.getABITypeAlign(Ty),
+                                       /*AddressSpace=*/0, CostKind);
+
   unsigned ImmIdx = ~0U;
   switch (Opcode) {
   default:
@@ -4952,6 +4999,16 @@
     break;
   }

+  // For throughput, an immediate in the operand slot the instruction expects
+  // (Idx == ImmIdx) is free; otherwise assume the constant is loaded from
+  // memory.
+  if (CostKind == TTI::TCK_RecipThroughput) {
+    if (Idx == ImmIdx)
+      return TTI::TCC_Free;
+    return X86TTIImpl::getMemoryOpCost(Instruction::Load, Ty,
+                                       DL.getABITypeAlign(Ty),
+                                       /*AddressSpace=*/0, CostKind);
+  }
+
   if (Idx == ImmIdx) {
     int NumConstants = divideCeil(BitSize, 64);
     InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
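The SLPVectorizer change below uses the new hook to decide whether gathering an all-constant operand list is profitable: it compares the target's cost to build the constant vector against the summed scalar-immediate costs, so a negative result means the vector form is cheaper. A simplified, self-contained restatement of that comparison (integer elements only, no undef handling; it mirrors the getCostForConstants helper added below rather than replacing it):

// Simplified restatement of the vector-vs-scalar constant cost comparison.
static llvm::InstructionCost
gatherConstCost(const llvm::TargetTransformInfo &TTI,
                llvm::ArrayRef<llvm::ConstantInt *> Consts,
                unsigned UserOpcode, unsigned UserIdx) {
  auto *VecTy =
      llvm::FixedVectorType::get(Consts.front()->getType(), Consts.size());
  // What the target charges for materializing the whole constant vector.
  llvm::InstructionCost VecCost = TTI.getConstBuildVectorInstrCost(
      VecTy, UserOpcode, UserIdx,
      llvm::TargetTransformInfo::TCK_RecipThroughput);
  // What the scalar immediates would cost individually.
  llvm::InstructionCost ScalarCost = 0;
  for (llvm::ConstantInt *CI : Consts)
    ScalarCost += TTI.getIntImmCostInst(
        UserOpcode, UserIdx, CI->getValue(), CI->getType(),
        llvm::TargetTransformInfo::TCK_RecipThroughput);
  // Negative => one constant vector beats rematerializing each scalar
  // immediate, so gathering the constants is considered profitable.
  return VecCost - ScalarCost;
}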
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5656,6 +5656,37 @@
   return I->getOpcode() == AltOp->getOpcode();
 }

+static InstructionCost getCostForConstants(TargetTransformInfo &TTI,
+                                           const DataLayout &DL,
+                                           ArrayRef<Value *> VL,
+                                           unsigned UserOpcode,
+                                           unsigned UserIdx) {
+  Type *ScalarTy = VL.front()->getType();
+  unsigned VF = VL.size();
+  auto *VecTy = FixedVectorType::get(ScalarTy, VF);
+  InstructionCost VecCost = TTI.getConstBuildVectorInstrCost(
+      VecTy, UserOpcode, UserIdx, TTI::TCK_RecipThroughput);
+  InstructionCost ScalarCost = 0;
+  if (!ScalarTy->isIntegerTy()) {
+    ScalarCost +=
+        TTI.getMemoryOpCost(Instruction::Load, ScalarTy,
+                            DL.getABITypeAlign(ScalarTy),
+                            /*AddressSpace=*/0, TTI::TCK_RecipThroughput) *
+        VF;
+  } else {
+    // Be conservative if the data type is larger than the largest legal int
+    // type.
+    for (Value *V : VL) {
+      if (isa<UndefValue>(V))
+        continue;
+      auto *CI = cast<ConstantInt>(V);
+      ScalarCost += TTI.getIntImmCostInst(UserOpcode, UserIdx, CI->getValue(),
+                                          ScalarTy, TTI::TCK_RecipThroughput);
+    }
+  }
+  return VecCost - ScalarCost;
+}
+
 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                       ArrayRef<Value *> VectorizedVals) {
   ArrayRef<Value *> VL = E->Scalars;
@@ -5761,6 +5792,29 @@
     }
   };
   if (E->State == TreeEntry::NeedToGather) {
-    if (allConstant(VL))
-      return 0;
+    if (allConstant(VL)) {
+      // For reduced constants there is no need to estimate the cost.
+      // FIXME: need to emit an accumulated constant value instead of building
+      // a graph.
+      if (E->UserTreeIndices.empty())
+        return 0;
+      if (all_of(VL, [](Value *V) {
+            if (isa<UndefValue>(V))
+              return true;
+            auto *C = cast<Constant>(V);
+            return C->isZeroValue();
+          }))
+        return 0;
+      // TODO: improve opcode and idx for alternate opcodes.
+      unsigned UserOpcode =
+          E->UserTreeIndices.front().UserTE->isAltShuffle()
+              ? 0
+              : E->UserTreeIndices.front().UserTE->getOpcode();
+      unsigned UserIdx = E->UserTreeIndices.front().UserTE->isAltShuffle()
+                             ? 0
+                             : E->UserTreeIndices.front().EdgeIdx;
+      return getCostForConstants(*TTI, *DL, VL, UserOpcode, UserIdx);
+    }
     if (isa<InsertElementInst>(VL[0]))
       return InstructionCost::getInvalid();
     SmallVector<int> Mask;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -288,17 +288,16 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i64 1
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i64 0
-; CHECK-NEXT:    [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[CMP4]], i1 false
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i64 1
+; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[TMP10]], i1 [[TMP9]], i1 false
 ; CHECK-NEXT:    br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
 ; CHECK:       lor.lhs.false:
-; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]],
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i64 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i64 1
-; CHECK-NEXT:    [[OR_COND1:%.*]] = select i1 [[TMP12]], i1 true, i1 [[TMP11]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fcmp ule <2 x double> [[TMP7]],
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i64 1
+; CHECK-NEXT:    [[OR_COND1:%.*]] = select i1 [[TMP13]], i1 true, i1 [[TMP12]]
 ; CHECK-NEXT:    br label [[CLEANUP]]
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[OR_COND1]], [[LOR_LHS_FALSE]] ]
diff --git a/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll b/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll
--- a/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll
+++ b/llvm/test/Transforms/PhaseOrdering/fast-basictest.ll
@@ -84,11 +84,20 @@
 ; TODO: This doesn't require 'nsz'.
It should fold to ((x1 - x2) * 47.0) define float @test13_reassoc(float %X1, float %X2) { -; CHECK-LABEL: @test13_reassoc( -; CHECK-NEXT: [[B:%.*]] = fmul reassoc float [[X1:%.*]], 4.700000e+01 -; CHECK-NEXT: [[C:%.*]] = fmul reassoc float [[X2:%.*]], 4.700000e+01 -; CHECK-NEXT: [[TMP1:%.*]] = fsub reassoc float [[B]], [[C]] -; CHECK-NEXT: ret float [[TMP1]] +; REASSOC_AND_IC-LABEL: @test13_reassoc( +; REASSOC_AND_IC-NEXT: [[B:%.*]] = fmul reassoc float [[X1:%.*]], 4.700000e+01 +; REASSOC_AND_IC-NEXT: [[C:%.*]] = fmul reassoc float [[X2:%.*]], 4.700000e+01 +; REASSOC_AND_IC-NEXT: [[TMP1:%.*]] = fsub reassoc float [[B]], [[C]] +; REASSOC_AND_IC-NEXT: ret float [[TMP1]] +; +; O2-LABEL: @test13_reassoc( +; O2-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[X1:%.*]], i64 0 +; O2-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[X2:%.*]], i64 1 +; O2-NEXT: [[TMP3:%.*]] = fmul reassoc <2 x float> [[TMP2]], +; O2-NEXT: [[SHIFT:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> +; O2-NEXT: [[TMP4:%.*]] = fsub reassoc <2 x float> [[TMP3]], [[SHIFT]] +; O2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i64 0 +; O2-NEXT: ret float [[TMP5]] ; %B = fmul reassoc float %X1, 47. ; X1*47 %C = fmul reassoc float %X2, -47. ; X2*-47 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -666,19 +666,19 @@ define void @single_membound(double* %arg, double* %arg1, double %x) { ; CHECK-LABEL: @single_membound( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[ARG:%.*]], i64 1 -; CHECK-NEXT: store double [[TMP]], double* [[TMP9]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[ARG1:%.*]], i64 0 +; CHECK-NEXT: [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01 +; CHECK-NEXT: store double [[TMP]], double* [[TMP9]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = fsub double 1.000000e+00, [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 2 ; CHECK-NEXT: br label [[BB15:%.*]] ; CHECK: bb15: -; CHECK-NEXT: [[TMP16:%.*]] = fmul double [[TMP]], 2.000000e+01 -; CHECK-NEXT: store double [[TMP16]], double* [[TMP9]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = fmul double [[TMP13]], 3.000000e+01 -; CHECK-NEXT: store double [[TMP17]], double* [[TMP14]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP9]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -1275,28 +1275,27 @@ ; CHECK-NEXT: [[T19:%.*]] = load float*, float** [[ARG:%.*]], align 8 ; CHECK-NEXT: [[T20:%.*]] = load float, float* [[ARG_3:%.*]], align 4 ; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds float, float* [[ARG_2:%.*]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> , float [[T20]], i32 1 ; CHECK-NEXT: br i1 [[C:%.*]], label [[BB22:%.*]], label [[BB30:%.*]] ; CHECK: bb22: ; CHECK-NEXT: 
[[T23:%.*]] = fmul float [[T20]], 9.900000e+01 -; CHECK-NEXT: [[T24:%.*]] = fmul float [[T23]], 9.900000e+01 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, float* [[T19]], i64 2 -; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01 -; CHECK-NEXT: store float [[T26]], float* [[T25]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[T23]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: store float [[TMP4]], float* [[T25]], align 4 ; CHECK-NEXT: [[T27:%.*]] = load float, float* [[T21]], align 8 -; CHECK-NEXT: [[T28:%.*]] = fadd float [[T24]], 2.000000e+01 -; CHECK-NEXT: [[T29:%.*]] = fadd float [[T26]], 2.000000e+01 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], ; CHECK-NEXT: br label [[BB30]] ; CHECK: bb30: -; CHECK-NEXT: [[T31:%.*]] = phi float [ [[T28]], [[BB22]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[T32:%.*]] = phi float [ [[T29]], [[BB22]] ], [ [[T20]], [[ENTRY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[BB36:%.*]] ; CHECK: bb36: -; CHECK-NEXT: [[T37:%.*]] = fmul float [[T31]], 3.000000e+00 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], ; CHECK-NEXT: [[T38:%.*]] = getelementptr inbounds float, float* [[ARG_3]], i64 0 -; CHECK-NEXT: store float [[T37]], float* [[T38]], align 4 -; CHECK-NEXT: [[T39:%.*]] = fmul float [[T32]], 3.000000e+00 -; CHECK-NEXT: [[T40:%.*]] = getelementptr inbounds float, float* [[ARG_3]], i64 1 -; CHECK-NEXT: store float [[T39]], float* [[T40]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[T38]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP7]], <2 x float>* [[TMP8]], align 4 ; CHECK-NEXT: br label [[BB41:%.*]] ; CHECK: bb41: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll --- a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll @@ -12,11 +12,13 @@ define void @foo(i64* nocapture writeonly %da) { ; CHECK-128-LABEL: @foo( ; CHECK-128-NEXT: entry: -; CHECK-128-NEXT: [[TMP0:%.*]] = bitcast i64* [[DA:%.*]] to <2 x i64>* -; CHECK-128-NEXT: store <2 x i64> , <2 x i64>* [[TMP0]], align 8 +; CHECK-128-NEXT: store i64 0, i64* [[DA:%.*]], align 8 +; CHECK-128-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 1 +; CHECK-128-NEXT: store i64 1, i64* [[ARRAYIDX1]], align 8 ; CHECK-128-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 2 -; CHECK-128-NEXT: [[TMP1:%.*]] = bitcast i64* [[ARRAYIDX2]] to <2 x i64>* -; CHECK-128-NEXT: store <2 x i64> , <2 x i64>* [[TMP1]], align 8 +; CHECK-128-NEXT: store i64 2, i64* [[ARRAYIDX2]], align 8 +; CHECK-128-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 3 +; CHECK-128-NEXT: store i64 3, i64* [[ARRAYIDX3]], align 8 ; CHECK-128-NEXT: ret void ; ; CHECK-256-LABEL: @foo( @@ -45,8 +47,9 @@ define void @foo8(i8* nocapture writeonly %da) { ; CHECK-LABEL: @foo8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[DA:%.*]] to <2 x i8>* -; CHECK-NEXT: store <2 x i8> , <2 x i8>* [[TMP0]], align 8 +; CHECK-NEXT: store i8 0, i8* [[DA:%.*]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, 
i8* [[DA]], i8 1 +; CHECK-NEXT: store i8 1, i8* [[ARRAYIDX1]], align 8 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[DA]], i8 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll @@ -23,54 +23,52 @@ ; CHECK-NEXT: [[D1_DATA_046:%.*]] = phi i8* [ [[TMP3]], [[ENTRY:%.*]] ], [ [[ADD_PTR23_1:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[Y_045:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[CONV]], -128 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 -; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP5]] to i32 -; CHECK-NEXT: [[SUB4:%.*]] = add nsw i32 [[CONV3]], -128 -; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[SUB]], -1 -; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i32 128, [[CONV]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP5]], i32 [[SUB]], i32 [[SUB7]] -; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[SUB4]], -1 -; CHECK-NEXT: [[SUB12:%.*]] = sub nsw i32 128, [[CONV3]] -; CHECK-NEXT: [[COND14:%.*]] = select i1 [[CMP8]], i32 [[SUB4]], i32 [[SUB12]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[COND14]], [[COND]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i8> poison, i8 [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i8> [[TMP6]], i8 [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <2 x i8> [[TMP7]] to <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <2 x i32> , [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP10]], <2 x i32> [[TMP9]], <2 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[IDX_NEG:%.*]] = sub nsw i32 0, [[ADD]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[IDX_NEG]] -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ADD_PTR]], align 1 -; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP15]] to i32 ; CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[CONV15]], [[INTENSITY:%.*]] ; CHECK-NEXT: [[CONV17:%.*]] = trunc i32 [[ADD16]] to i8 ; CHECK-NEXT: store i8 [[CONV17]], i8* [[ADD_PTR]], align 1 ; CHECK-NEXT: [[ADD_PTR18:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[ADD]] -; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP7]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP16]], 0 ; CHECK-NEXT: [[CONV21:%.*]] = zext i1 [[NOT_TOBOOL]] to i8 ; CHECK-NEXT: store i8 [[CONV21]], i8* [[ADD_PTR18]], align 1 ; CHECK-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP8]] to i32 -; CHECK-NEXT: [[SUB_1:%.*]] = add nsw i32 [[CONV_1]], -128 -; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 -; CHECK-NEXT: [[CONV3_1:%.*]] = 
zext i8 [[TMP9]] to i32 -; CHECK-NEXT: [[SUB4_1:%.*]] = add nsw i32 [[CONV3_1]], -128 -; CHECK-NEXT: [[CMP5_1:%.*]] = icmp sgt i32 [[SUB_1]], -1 -; CHECK-NEXT: [[SUB7_1:%.*]] = sub nsw i32 128, [[CONV_1]] -; CHECK-NEXT: [[COND_1:%.*]] = select i1 [[CMP5_1]], i32 [[SUB_1]], i32 [[SUB7_1]] -; CHECK-NEXT: [[CMP8_1:%.*]] = icmp sgt i32 [[SUB4_1]], -1 -; CHECK-NEXT: [[SUB12_1:%.*]] = sub nsw i32 128, [[CONV3_1]] -; CHECK-NEXT: [[COND14_1:%.*]] = select i1 [[CMP8_1]], i32 [[SUB4_1]], i32 [[SUB12_1]] -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[COND14_1]], [[COND_1]] +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i8> poison, i8 [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = add nsw <2 x i32> [[TMP21]], +; CHECK-NEXT: [[TMP23:%.*]] = icmp sgt <2 x i32> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw <2 x i32> , [[TMP21]] +; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP23]], <2 x i32> [[TMP22]], <2 x i32> [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] ; CHECK-NEXT: [[IDX_NEG_1:%.*]] = sub nsw i32 0, [[ADD_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[IDX_NEG_1]] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP28]] to i32 ; CHECK-NEXT: [[ADD16_1:%.*]] = add nsw i32 [[CONV15_1]], [[INTENSITY]] ; CHECK-NEXT: [[CONV17_1:%.*]] = trunc i32 [[ADD16_1]] to i8 ; CHECK-NEXT: store i8 [[CONV17_1]], i8* [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[ADD_PTR18_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[ADD_1]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP11]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP29]], 0 ; CHECK-NEXT: [[CONV21_1:%.*]] = zext i1 [[NOT_TOBOOL_1]] to i8 ; CHECK-NEXT: store i8 [[CONV21_1]], i8* [[ADD_PTR18_1]], align 1 ; CHECK-NEXT: [[ADD_PTR23_1]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck 
%s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @fadd_fsub_v8f32( @@ -91,28 +91,9 @@ } define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { -; SSE-LABEL: @fmul_fdiv_v4f32_const( -; SSE-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; SSE-NEXT: ret <4 x float> [[TMP1]] -; -; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[A2]], i64 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i64 3 -; SLM-NEXT: ret <4 x float> [[R3]] -; -; AVX-LABEL: @fmul_fdiv_v4f32_const( -; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; AVX-NEXT: ret <4 x float> [[TMP1]] -; -; AVX512-LABEL: @fmul_fdiv_v4f32_const( -; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; AVX512-NEXT: ret <4 x float> [[TMP1]] +; CHECK-LABEL: @fmul_fdiv_v4f32_const( +; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK 
--check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basic-aa -slp-vectorizer -instcombine -S | FileCheck %s define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: @fadd_fsub_v8f32( @@ -91,28 +91,9 @@ } define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { -; SSE-LABEL: @fmul_fdiv_v4f32_const( -; SSE-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; SSE-NEXT: ret <4 x float> [[TMP1]] -; -; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], -; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[A2]], i64 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i64 3 -; SLM-NEXT: ret <4 x float> [[R3]] -; -; AVX-LABEL: @fmul_fdiv_v4f32_const( -; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; AVX-NEXT: ret <4 x float> [[TMP1]] -; -; AVX512-LABEL: @fmul_fdiv_v4f32_const( -; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], -; AVX512-NEXT: ret <4 x float> [[TMP1]] +; CHECK-LABEL: @fmul_fdiv_v4f32_const( +; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], +; CHECK-NEXT: ret <4 x float> [[TMP1]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4 +; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE2 +; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE4 ; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX ; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s 
--check-prefixes=AVX512 @@ -15,39 +15,71 @@ ; } define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) { -; SSE-LABEL: @bitmask_16xi8( -; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 -; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 -; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 -; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> -; SSE-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 -; SSE-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer -; SSE-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> -; SSE-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; SSE-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 -; SSE-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 -; SSE-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 -; SSE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 -; SSE-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 -; SSE-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; SSE-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 -; SSE-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; SSE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) -; SSE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] -; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]] -; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]] -; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] -; SSE-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]] -; SSE-NEXT: ret i64 [[OP_RDX4]] +; SSE2-LABEL: @bitmask_16xi8( +; SSE2-NEXT: entry: +; SSE2-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 +; SSE2-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 +; SSE2-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 +; SSE2-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer +; SSE2-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> +; SSE2-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 +; SSE2-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; SSE2-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> +; SSE2-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 +; SSE2-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 +; SSE2-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 +; SSE2-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 +; SSE2-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 +; 
SSE2-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 +; SSE2-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 +; SSE2-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 +; SSE2-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 +; SSE2-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; SSE2-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 +; SSE2-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 +; SSE2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) +; SSE2-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] +; SSE2-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]] +; SSE2-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]] +; SSE2-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] +; SSE2-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]] +; SSE2-NEXT: ret i64 [[OP_RDX4]] +; +; SSE4-LABEL: @bitmask_16xi8( +; SSE4-NEXT: entry: +; SSE4-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 +; SSE4-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 +; SSE4-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE4-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 +; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer +; SSE4-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> +; SSE4-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 +; SSE4-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 +; SSE4-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; SSE4-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> +; SSE4-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 +; SSE4-NEXT: [[TMP7:%.*]] = load <2 x i8>, ptr [[ARRAYIDX_13]], align 1 +; SSE4-NEXT: [[TMP8:%.*]] = icmp eq <2 x i8> [[TMP7]], zeroinitializer +; SSE4-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i64> zeroinitializer, <2 x i64> +; SSE4-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 +; SSE4-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; SSE4-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP10]], 0 +; SSE4-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 +; SSE4-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) +; SSE4-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) +; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP11]], [[TMP12]] +; SSE4-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 +; SSE4-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 +; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP13]], [[TMP14]] +; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]] +; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] +; SSE4-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]] +; SSE4-NEXT: ret i64 [[OP_RDX4]] ; ; AVX-LABEL: @bitmask_16xi8( ; AVX-NEXT: entry: @@ -63,21 +95,19 @@ ; AVX-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer ; AVX-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> ; AVX-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; AVX-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 -; AVX-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq 
i8 [[TMP7]], 0 -; AVX-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 -; AVX-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 -; AVX-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 -; AVX-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 -; AVX-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 +; AVX-NEXT: [[TMP7:%.*]] = load <2 x i8>, ptr [[ARRAYIDX_13]], align 1 +; AVX-NEXT: [[TMP8:%.*]] = icmp eq <2 x i8> [[TMP7]], zeroinitializer +; AVX-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i64> zeroinitializer, <2 x i64> ; AVX-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; AVX-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; AVX-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 +; AVX-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; AVX-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP10]], 0 ; AVX-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; AVX-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] -; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]] +; AVX-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) +; AVX-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) +; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP11]], [[TMP12]] +; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 +; AVX-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 +; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP13]], [[TMP14]] ; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]] ; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] ; AVX-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]] @@ -198,33 +228,59 @@ } define i64 @bitmask_4xi16(ptr nocapture noundef readonly %src) { -; SSE-LABEL: @bitmask_4xi16( -; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2 -; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0 -; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2 -; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 -; SSE-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 -; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 -; SSE-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 -; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 -; SSE-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 -; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] -; SSE-NEXT: [[OP_RDX1:%.*]] = or 
i64 [[OR_7]], [[OR]] -; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] -; SSE-NEXT: ret i64 [[OP_RDX3]] +; SSE2-LABEL: @bitmask_4xi16( +; SSE2-NEXT: entry: +; SSE2-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2 +; SSE2-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0 +; SSE2-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2 +; SSE2-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +; SSE2-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> +; SSE2-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 +; SSE2-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 +; SSE2-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 +; SSE2-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; SSE2-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 +; SSE2-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 +; SSE2-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 +; SSE2-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; SSE2-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 +; SSE2-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; SSE2-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 +; SSE2-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 +; SSE2-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] +; SSE2-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] +; SSE2-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] +; SSE2-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] +; SSE2-NEXT: ret i64 [[OP_RDX3]] +; +; SSE4-LABEL: @bitmask_4xi16( +; SSE4-NEXT: entry: +; SSE4-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2 +; SSE4-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0 +; SSE4-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE4-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1 +; SSE4-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2 +; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +; SSE4-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> +; SSE4-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 +; SSE4-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_5]], align 2 +; SSE4-NEXT: [[TMP5:%.*]] = icmp eq <2 x i16> [[TMP4]], zeroinitializer +; SSE4-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> +; SSE4-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 +; SSE4-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; SSE4-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP7]], 0 +; SSE4-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 +; SSE4-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE4-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE4-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] +; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] +; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] +; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] 
+; SSE4-NEXT: ret i64 [[OP_RDX3]] ; ; AVX-LABEL: @bitmask_4xi16( ; AVX-NEXT: entry: @@ -236,22 +292,20 @@ ; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer ; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> ; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 -; AVX-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 -; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 -; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 -; AVX-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 -; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 -; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; AVX-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_5]], align 2 +; AVX-NEXT: [[TMP5:%.*]] = icmp eq <2 x i16> [[TMP4]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> ; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 -; AVX-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 +; AVX-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP7]], 0 ; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] +; AVX-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] ; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] ; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] +; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] ; AVX-NEXT: ret i64 [[OP_RDX3]] ; ; AVX512-LABEL: @bitmask_4xi16( @@ -323,33 +377,59 @@ } define i64 @bitmask_8xi32(ptr nocapture noundef readonly %src) { -; SSE-LABEL: @bitmask_8xi32( -; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0 -; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4 -; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 -; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 -; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 -; SSE-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 -; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 -; SSE-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = 
icmp eq i32 [[TMP6]], 0 -; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] -; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] -; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] -; SSE-NEXT: ret i64 [[OP_RDX3]] +; SSE2-LABEL: @bitmask_8xi32( +; SSE2-NEXT: entry: +; SSE2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 +; SSE2-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0 +; SSE2-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +; SSE2-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> +; SSE2-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 +; SSE2-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 +; SSE2-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 +; SSE2-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 +; SSE2-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 +; SSE2-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 +; SSE2-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 +; SSE2-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; SSE2-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 +; SSE2-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 +; SSE2-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 +; SSE2-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 +; SSE2-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] +; SSE2-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] +; SSE2-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] +; SSE2-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] +; SSE2-NEXT: ret i64 [[OP_RDX3]] +; +; SSE4-LABEL: @bitmask_8xi32( +; SSE4-NEXT: entry: +; SSE4-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 +; SSE4-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0 +; SSE4-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE4-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1 +; SSE4-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4 +; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +; SSE4-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> +; SSE4-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 +; SSE4-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_5]], align 4 +; SSE4-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[TMP4]], zeroinitializer +; SSE4-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> +; SSE4-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 +; SSE4-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 +; SSE4-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP7]], 0 +; SSE4-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 +; SSE4-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE4-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE4-NEXT: [[TMP10:%.*]] = 
extractelement <2 x i64> [[TMP6]], i32 1 +; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] +; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] +; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] +; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] +; SSE4-NEXT: ret i64 [[OP_RDX3]] ; ; AVX-LABEL: @bitmask_8xi32( ; AVX-NEXT: entry: @@ -361,22 +441,20 @@ ; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer ; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16> ; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 -; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 -; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 -; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 -; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 -; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 -; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; AVX-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_5]], align 4 +; AVX-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[TMP4]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> <i64 32, i64 64> ; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 -; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 +; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 +; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP7]], 0 ; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] +; AVX-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] ; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] ; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] +; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] ; AVX-NEXT: ret i64 [[OP_RDX3]] ; ; AVX512-LABEL: @bitmask_8xi32( @@ -500,22 +578,20 @@ ; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; SSE4-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16> ; SSE4-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5 -; SSE4-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX_5]], align 8 -; SSE4-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0 -; SSE4-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; SSE4-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6 -; SSE4-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8 -; SSE4-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i64 [[TMP5]], 0 -; SSE4-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; SSE4-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_5]], align 8 +; SSE4-NEXT: [[TMP5:%.*]] = icmp eq <2 x i64> [[TMP4]], zeroinitializer +; SSE4-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> <i64 32, i64 64> ; SSE4-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7 -; SSE4-NEXT: [[TMP6:%.*]] =
load i64, ptr [[ARRAYIDX_7]], align 8 -; SSE4-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0 +; SSE4-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 +; SSE4-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP7]], 0 ; SSE4-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE4-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] +; SSE4-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; SSE4-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE4-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] ; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] ; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] +; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] ; SSE4-NEXT: ret i64 [[OP_RDX3]] ; ; AVX-LABEL: @bitmask_8xi64( @@ -528,22 +604,20 @@ ; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16> ; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5 -; AVX-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX_5]], align 8 -; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0 -; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6 -; AVX-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8 -; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i64 [[TMP5]], 0 -; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 +; AVX-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_5]], align 8 +; AVX-NEXT: [[TMP5:%.*]] = icmp eq <2 x i64> [[TMP4]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> <i64 32, i64 64> ; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7 -; AVX-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 -; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0 +; AVX-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 +; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP7]], 0 ; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]] +; AVX-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) +; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]] ; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]] ; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]] +; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]] ; AVX-NEXT: ret i64 [[OP_RDX3]] ; ; AVX512-LABEL: @bitmask_8xi64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll @@ -73,17 +73,16 @@ ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[MUL88]], i32 1 ;
CHECK-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1 -; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0 -; CHECK-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false +; CHECK-NEXT: [[TMP34:%.*]] = fcmp olt <2 x double> [[TMP33]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D> +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP34]], i32 1 +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP36]], i1 [[TMP35]], i1 false ; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: -; CHECK-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1 -; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]] +; CHECK-NEXT: [[TMP37:%.*]] = fcmp ule <2 x double> [[TMP33]], +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP37]], i32 0 +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP37]], i32 1 +; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP39]], i1 true, i1 [[TMP38]] ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 ; CHECK-NEXT: br label [[CLEANUP]] ; CHECK: cleanup: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll @@ -10,21 +10,26 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INIT:%.*]] = load double, double* @a, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[INIT]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[INIT]], i32 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[PHI:%.*]] = phi double [ [[ADD2:%.*]], [[LOOP]] ], [ [[INIT]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[POSTADD1_PHI:%.*]] = phi double [ [[POSTADD1:%.*]], [[LOOP]] ], [ [[INIT]], [[ENTRY]] ] -; CHECK-NEXT: [[POSTADD2_PHI:%.*]] = phi double [ [[POSTADD2:%.*]], [[LOOP]] ], [ [[INIT]], [[ENTRY]] ] -; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[POSTADD1_PHI]], undef -; CHECK-NEXT: [[ADD2]] = fadd double [[POSTADD2_PHI]], [[PHI]] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP9:%.*]], [[LOOP]] ], [ [[TMP1]], [[ENTRY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[TMP3]], undef +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD2]] = fadd double [[TMP4]], [[PHI]] ; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[ADD2]], 0.000000e+00 -; CHECK-NEXT: [[BINARYOP_B:%.*]] = fadd double [[POSTADD1_PHI]], [[MUL2]] +; CHECK-NEXT: [[BINARYOP_B:%.*]] = fadd double [[TMP3]], [[MUL2]] ; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[ADD1]], 0.000000e+00 -; CHECK-NEXT: [[TMP:%.*]] = fadd double [[POSTADD2_PHI]], 0.000000e+00 -; CHECK-NEXT: [[BINARY_V:%.*]] = fadd double [[MUL1]], [[BINARYOP_B]] -; CHECK-NEXT: [[POSTADD1]] = fadd double [[BINARY_V]], 0.000000e+00 -; CHECK-NEXT: [[POSTADD2]] = fadd double [[TMP]], 1.000000e+00 -; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une double [[POSTADD1]],
0.000000e+00 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[BINARYOP_B]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9]] = fadd <2 x double> [[TMP8]], <double 0.000000e+00, double 1.000000e+00> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 +; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une double [[TMP10]], 0.000000e+00 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll @@ -14,18 +14,19 @@ ; CHECK-NEXT: ret void ; CHECK: if.else: ; CHECK-NEXT: [[M_NUMCONSTRAINTROWS4:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[NUB5:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO]], i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]] ; CHECK: land.lhs.true.i.1: ; CHECK-NEXT: br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]] ; CHECK: if.then7.1: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> <i32 1, i32 5>, <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: store i32 1, i32* [[M_NUMCONSTRAINTROWS4]], align 4 +; CHECK-NEXT: store i32 5, i32* [[NUB5]], align 4 ; CHECK-NEXT: br label [[FOR_INC_1]] ; CHECK: for.inc.1: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ , [[IF_THEN7_1]] ], [ , [[LAND_LHS_TRUE_I_1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ , [[IF_THEN7_1]] ], [ , [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP1]], <2 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll @@ -4,23 +4,14 @@ define i32 @crash_reordering_undefs() { ; CHECK-LABEL: @crash_reordering_undefs( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]] -; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537 -; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]] -;
CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[ADD0]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[TMP0]], [[OP_RDX3]] -; CHECK-NEXT: ret i32 [[OP_RDX4]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> undef, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> undef, [[SHUFFLE]] +; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> <i32 65536, i32 65536, i32 65536, i32 65536>, <4 x i32> <i32 65537, i32 65537, i32 65537, i32 65537> +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], undef +; CHECK-NEXT: ret i32 [[OP_RDX1]] ; entry: %or0 = or i64 undef, undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll @@ -27,24 +27,25 @@ ; CHECK: land.rhs.lr.ph: ; CHECK-NEXT: unreachable ; CHECK: if.end98: +; CHECK-NEXT: [[FROM299:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE167]], label [[IF_THEN103:%.*]] ; CHECK: if.then103: -; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 ; CHECK-NEXT: [[DOTSUB100:%.*]] = select i1 undef, i32 250, i32 undef ; CHECK-NEXT: [[MUL114:%.*]] = shl nsw i32 [[DOTSUB100]], 2 +; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 ; CHECK-NEXT: [[COND125:%.*]] = select i1 undef, i32 undef, i32 [[MUL114]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[COND125]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[DOTSUB100]], i32 1 ; CHECK-NEXT: br label [[FOR_COND_I:%.*]] ; CHECK: for.cond.i: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ undef, [[LAND_RHS_I874:%.*]] ], [ [[TMP1]], [[IF_THEN103]] ] +; CHECK-NEXT: [[ROW_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874:%.*]] ], [ [[DOTSUB100]], [[IF_THEN103]] ] +; CHECK-NEXT:
[[COL_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874]] ], [ [[COND125]], [[IF_THEN103]] ] ; CHECK-NEXT: br i1 undef, label [[LAND_RHS_I874]], label [[FOR_END_I:%.*]] ; CHECK: land.rhs.i874: ; CHECK-NEXT: br i1 undef, label [[FOR_COND_I]], label [[FOR_END_I]] ; CHECK: for.end.i: ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]] ; CHECK: if.then.i: -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], undef +; CHECK-NEXT: [[ADD14_I:%.*]] = add nsw i32 [[ROW_0_I]], undef +; CHECK-NEXT: [[ADD15_I:%.*]] = add nsw i32 [[COL_0_I]], undef ; CHECK-NEXT: br label [[EXTEND_BW_EXIT:%.*]] ; CHECK: if.end.i: ; CHECK-NEXT: [[ADD16_I:%.*]] = add i32 [[COND125]], [[DOTSUB100]] @@ -65,12 +66,14 @@ ; CHECK: while.end275.i: ; CHECK-NEXT: br label [[EXTEND_BW_EXIT]] ; CHECK: extend_bw.exit: -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ [[TMP3]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] +; CHECK-NEXT: [[ADD14_I1262:%.*]] = phi i32 [ [[ADD14_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] +; CHECK-NEXT: [[ADD15_I1261:%.*]] = phi i32 [ [[ADD15_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] ; CHECK-NEXT: br i1 false, label [[IF_THEN157:%.*]], label [[LAND_LHS_TRUE167]] ; CHECK: if.then157: -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[FROM1115]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[ADD158:%.*]] = add nsw i32 [[ADD14_I1262]], 1 +; CHECK-NEXT: store i32 [[ADD158]], i32* [[FROM299]], align 4 +; CHECK-NEXT: [[ADD160:%.*]] = add nsw i32 [[ADD15_I1261]], 1 +; CHECK-NEXT: store i32 [[ADD160]], i32* [[FROM1115]], align 4 ; CHECK-NEXT: br label [[LAND_LHS_TRUE167]] ; CHECK: land.lhs.true167: ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -18,18 +18,13 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[MUL11]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[G]] to 
<4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP6]], <4 x double>* [[TMP7]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll @@ -7,17 +7,23 @@ define i32 @foo(i32* nocapture %A, i32 %n, i32 %m) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[EXTERNALUSE1:%.*]] = add nsw i32 [[TMP6]], [[M:%.*]] -; CHECK-NEXT: [[EXTERNALUSE2:%.*]] = mul nsw i32 [[TMP6]], [[M]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[N:%.*]], 5 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], 9 +; CHECK-NEXT: store i32 [[ADD]], i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[N]], 9 +; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[MUL1]], 9 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 +; CHECK-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[MUL4:%.*]] = shl i32 [[N]], 3 +; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[MUL4]], 9 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; CHECK-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[MUL7:%.*]] = mul nsw i32 [[N]], 10 +; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[MUL7]], 9 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: [[EXTERNALUSE1:%.*]] = add nsw i32 [[ADD]], [[M:%.*]] +; CHECK-NEXT: [[EXTERNALUSE2:%.*]] = mul nsw i32 [[ADD]], [[M]] ; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[EXTERNALUSE1]], [[EXTERNALUSE2]] ; CHECK-NEXT: ret i32 [[ADD10]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -12,35 +12,37 @@ ; CHECK-NEXT: br i1 [[TOBOOL_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[C_022:%.*]] = phi i32* [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32*> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[B_021:%.*]] = phi i32* [ [[B_021_BE:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[A_020:%.*]] = phi i32* [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i32* [[C_022]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; 
CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [ +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint i32* [[C_022]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[B_021]], i64 1 +; CHECK-NEXT: switch i32 [[TMP2]], label [[WHILE_BODY_BACKEDGE]] [ ; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint i32* [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1 -; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP9]], align 4 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[B_021]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint i32* [[INCDEC_PTR2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 2 +; CHECK-NEXT: store i32 [[TMP4]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: +; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[A_020]], i64 2 ; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; CHECK-NEXT: [[INCDEC_PTR9:%.*]] = getelementptr inbounds i32, i32* [[B_021]], i64 2 +; CHECK-NEXT: store i32 [[TMP6]], i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: ; CHECK-NEXT: [[C_022_BE]] = phi i32* [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[B_021_BE]] = phi i32* [ [[INCDEC_PTR2]], [[WHILE_BODY]] ], [ [[INCDEC_PTR9]], [[SW_BB6]] ], [ [[INCDEC_PTR3]], [[SW_BB]] ] +; CHECK-NEXT: [[A_020_BE]] = phi i32* [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR7]], [[SW_BB6]] ], [ [[INCDEC_PTR4]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -1411,53 +1411,15 @@ ; This should not crash. 
define void @PR49730() { -; SSE-LABEL: @PR49730( -; SSE-NEXT: [[T:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 2) -; SSE-NEXT: [[T1:%.*]] = sub nsw i32 undef, [[T]] -; SSE-NEXT: [[T2:%.*]] = call i32 @llvm.umin.i32(i32 undef, i32 [[T1]]) -; SSE-NEXT: [[T3:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 2) -; SSE-NEXT: [[T4:%.*]] = sub nsw i32 undef, [[T3]] -; SSE-NEXT: [[T5:%.*]] = call i32 @llvm.umin.i32(i32 [[T2]], i32 [[T4]]) -; SSE-NEXT: [[T6:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 1) -; SSE-NEXT: [[T7:%.*]] = sub nuw nsw i32 undef, [[T6]] -; SSE-NEXT: [[T8:%.*]] = call i32 @llvm.umin.i32(i32 [[T5]], i32 [[T7]]) -; SSE-NEXT: [[T9:%.*]] = call i32 @llvm.smin.i32(i32 undef, i32 1) -; SSE-NEXT: [[T10:%.*]] = sub nsw i32 undef, [[T9]] -; SSE-NEXT: [[T11:%.*]] = call i32 @llvm.umin.i32(i32 [[T8]], i32 [[T10]]) -; SSE-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef -; SSE-NEXT: [[T13:%.*]] = call i32 @llvm.umin.i32(i32 [[T11]], i32 [[T12]]) -; SSE-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[T13]], i32 93) -; SSE-NEXT: ret void -; -; AVX-LABEL: @PR49730( -; AVX-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> ) -; AVX-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] -; AVX-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef -; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef) -; AVX-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]]) -; AVX-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) -; AVX-NEXT: ret void -; -; AVX2-LABEL: @PR49730( -; AVX2-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> ) -; AVX2-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] -; AVX2-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef -; AVX2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) -; AVX2-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef) -; AVX2-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]]) -; AVX2-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) -; AVX2-NEXT: ret void -; -; THRESH-LABEL: @PR49730( -; THRESH-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> ) -; THRESH-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] -; THRESH-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef -; THRESH-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) -; THRESH-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef) -; THRESH-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]]) -; THRESH-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) -; THRESH-NEXT: ret void +; CHECK-LABEL: @PR49730( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> ) +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] +; CHECK-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]]) +; CHECK-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93) +; CHECK-NEXT: ret void ; %t = call i32 @llvm.smin.i32(i32 undef, i32 2) %t1 = sub nsw i32 undef, %t diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -447,26 +447,28 @@ ; ALL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ] ; ALL-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2 ; ALL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]] -; ALL-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; ALL-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1 -; ALL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] -; ALL-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4 -; ALL-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2 -; ALL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]] -; ALL-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4 -; ALL-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3 -; ALL-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]] -; ALL-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 +; ALL-NEXT: [[TMP1:%.*]] = or i64 [[TMP0]], 1 +; ALL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP1]] +; ALL-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 2 +; ALL-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]] +; ALL-NEXT: [[TMP3:%.*]] = or i64 [[TMP0]], 3 +; ALL-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP3]] +; ALL-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; ALL-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 +; ALL-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP5]], i32 0 +; ALL-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i32 1 +; ALL-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i32 2 +; ALL-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP5]], i32 3 ; ALL-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]] ; ALL: for.body16.lr.ph: ; ALL-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]] -; ALL-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4 +; ALL-NEXT: [[TMP10:%.*]] = load float, float* [[ADD_PTR]], align 4 ; ALL-NEXT: br label [[FOR_BODY16:%.*]] ; ALL: for.cond.cleanup15: -; ALL-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ] +; ALL-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP8]], [[FOR_BODY]] ], [ [[OP_RDX:%.*]], [[FOR_BODY16]] ] +; ALL-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP9]], [[FOR_BODY]] ], [ [[TMP24:%.*]], [[FOR_BODY16]] ] +; ALL-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP12:%.*]], [[FOR_BODY16]] ] +; ALL-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP6]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ] ; ALL-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4 ; ALL-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4 ; ALL-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4 @@ -475,26 +477,27 @@ ; ALL-NEXT: 
[[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6 ; ALL-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] ; ALL: for.body16: -; ALL-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ] ; ALL-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ] -; ALL-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ] -; ALL-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000 -; ALL-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000 -; ALL-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]] -; ALL-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]] +; ALL-NEXT: [[TMP11:%.*]] = phi <4 x float> [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[TMP23:%.*]], [[FOR_BODY16]] ] +; ALL-NEXT: [[TMP12]] = extractelement <4 x float> [[TMP11]], i32 0 +; ALL-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1 +; ALL-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i32 0 +; ALL-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i32 1 +; ALL-NEXT: [[TMP16:%.*]] = fmul fast <2 x float> [[TMP15]], +; ALL-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 +; ALL-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 +; ALL-NEXT: [[SUB92:%.*]] = fadd fast float [[TMP17]], [[TMP18]] +; ALL-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP10]] ; ALL-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000 -; ALL-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000 -; ALL-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000 -; ALL-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000 -; ALL-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000 -; ALL-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]] -; ALL-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]] -; ALL-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]] -; ALL-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]] +; ALL-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> [[TMP11]], +; ALL-NEXT: [[TMP20:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP19]]) +; ALL-NEXT: [[OP_RDX]] = fadd fast float [[TMP20]], [[MUL20]] ; ALL-NEXT: [[INC]] = add nuw i32 [[J_098]], 1 ; ALL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]] +; ALL-NEXT: [[TMP21:%.*]] = insertelement <4 x float> poison, float [[SUB19]], i32 0 +; ALL-NEXT: [[TMP22:%.*]] = shufflevector <4 x float> [[TMP21]], <4 x float> [[TMP11]], <4 x i32> +; ALL-NEXT: [[TMP23]] = insertelement <4 x float> [[TMP22]], float [[OP_RDX]], i32 2 +; ALL-NEXT: [[TMP24]] = extractelement <4 x float> [[TMP11]], i32 2 ; ALL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -482,37 +482,23 @@ define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { -; SSE-LABEL: 
@ExtractIdxNotConstantInt1( -; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef -; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] -; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] -; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; SSE-NEXT: ret i1 [[CMP_I185]] -; -; AVX-LABEL: @ExtractIdxNotConstantInt1( -; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef -; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]] -; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]] -; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] -; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01 -; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]] -; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01 -; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]] -; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]] -; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; AVX-NEXT: ret i1 [[CMP_I185]] +; CHECK-LABEL: @ExtractIdxNotConstantInt1( +; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef +; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] +; CHECK-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP_I185]] ; %vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef %sub14.i167 = fsub float undef, %vecext.i291.i166 @@ -530,37 +516,23 @@ define 
i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { -; SSE-LABEL: @ExtractIdxNotConstantInt2( -; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1 -; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] -; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] -; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; SSE-NEXT: ret i1 [[CMP_I185]] -; -; AVX-LABEL: @ExtractIdxNotConstantInt2( -; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1 -; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]] -; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]] -; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] -; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01 -; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]] -; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01 -; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]] -; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]] -; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; AVX-NEXT: ret i1 [[CMP_I185]] +; CHECK-LABEL: @ExtractIdxNotConstantInt2( +; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1 +; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] +; CHECK-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP_I185]] ; %vecext.i291.i166 = extractelement <4 x float> 
%vec, i64 1 %sub14.i167 = fsub float undef, %vecext.i291.i166 @@ -578,37 +550,23 @@ define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { -; SSE-LABEL: @foo( -; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 -; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 -; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] -; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] -; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; SSE-NEXT: ret i1 [[CMP_I185]] -; -; AVX-LABEL: @foo( -; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 -; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]] -; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]] -; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 -; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01 -; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]] -; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01 -; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]] -; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]] -; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 -; AVX-NEXT: ret i1 [[CMP_I185]] +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 +; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] +; CHECK-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 +; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP_I185]] ; %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0 %sub14.i167 = fsub 
float undef, %vecext.i291.i166 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+sse2 -S | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -slp-threshold=-6 -slp-vectorizer -instcombine -mattr=+avx2 -S | FileCheck %s --check-prefixes=AVX target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -13,21 +13,34 @@ ; zero-extend the roots back to their original sizes. ; define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) { -; CHECK-LABEL: @PR31243_zext( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1 -; CHECK-NEXT: [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1 -; CHECK-NEXT: [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]] -; CHECK-NEXT: ret i8 [[TMP_8]] +; SSE-LABEL: @PR31243_zext( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = or i8 [[V0:%.*]], 1 +; SSE-NEXT: [[TMP1:%.*]] = or i8 [[V1:%.*]], 1 +; SSE-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i64 +; SSE-NEXT: [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP2]] +; SSE-NEXT: [[TMP3:%.*]] = zext i8 [[TMP1]] to i64 +; SSE-NEXT: [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP3]] +; SSE-NEXT: [[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1 +; SSE-NEXT: [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1 +; SSE-NEXT: [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]] +; SSE-NEXT: ret i8 [[TMP_8]] +; +; AVX-LABEL: @PR31243_zext( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0 +; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1 +; AVX-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], +; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0 +; AVX-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64 +; AVX-NEXT: [[TMP_4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]] +; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1 +; AVX-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64 +; AVX-NEXT: [[TMP_5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]] +; AVX-NEXT: 
[[TMP_6:%.*]] = load i8, i8* [[TMP_4]], align 1 +; AVX-NEXT: [[TMP_7:%.*]] = load i8, i8* [[TMP_5]], align 1 +; AVX-NEXT: [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]] +; AVX-NEXT: ret i8 [[TMP_8]] ; entry: %tmp_0 = zext i8 %v0 to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -67,25 +67,32 @@ ; SSE-LABEL: @pr35497( ; SSE-NEXT: entry: ; SSE-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1 +; SSE-NEXT: [[AND:%.*]] = shl i64 [[TMP0]], 2 +; SSE-NEXT: [[SHL:%.*]] = and i64 [[AND]], 20 ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], i64* undef, align 1 +; SSE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 +; SSE-NEXT: [[AND_1:%.*]] = shl i64 undef, 2 +; SSE-NEXT: [[SHL_1:%.*]] = and i64 [[AND_1]], 20 +; SSE-NEXT: [[SHR_1:%.*]] = lshr i64 undef, 6 +; SSE-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[SHL]], [[SHR_1]] ; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 +; SSE-NEXT: [[SHR_2:%.*]] = lshr i64 undef, 6 +; SSE-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[SHL_1]], [[SHR_2]] +; SSE-NEXT: [[AND_4:%.*]] = shl i64 [[ADD]], 2 +; SSE-NEXT: [[SHL_4:%.*]] = and i64 [[AND_4]], 20 +; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 +; SSE-NEXT: store i64 [[ADD_1]], i64* [[ARRAYIDX2_5]], align 1 +; SSE-NEXT: [[AND_5:%.*]] = shl nuw nsw i64 [[ADD_1]], 2 +; SSE-NEXT: [[SHL_5:%.*]] = and i64 [[AND_5]], 20 +; SSE-NEXT: [[SHR_5:%.*]] = lshr i64 [[ADD_1]], 6 +; SSE-NEXT: [[ADD_5:%.*]] = add nuw nsw i64 [[SHL_4]], [[SHR_5]] +; SSE-NEXT: store i64 [[ADD_5]], i64* [[ARRAYIDX2_1]], align 1 ; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 -; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], -; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer -; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 -; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1 -; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], -; SSE-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], -; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], -; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] -; SSE-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* -; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 +; SSE-NEXT: store i64 [[ADD_2]], i64* [[ARRAYIDX2_6]], align 1 +; SSE-NEXT: [[SHR_6:%.*]] = lshr i64 [[ADD_2]], 6 +; SSE-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[SHL_5]], [[SHR_6]] +; SSE-NEXT: store i64 [[ADD_6]], i64* [[ARRAYIDX2_2]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -199,29 +199,33 @@ ; AVX-NEXT: 
; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11
; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0
-; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1
-; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2
-; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3
-; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4
-; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5
-; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6
-; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
-; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]],
-; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
-; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
+; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
+; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
+; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
+; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]],
+; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; AVX-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0
+; AVX-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1
+; AVX-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2
+; AVX-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3
+; AVX-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]],
+; AVX-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_3(
@@ -402,6 +406,7 @@
; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
+; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4
; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
@@ -410,21 +415,24 @@
; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0
+; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1
+; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2
+; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3
+; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]],
+; AVX-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
+; AVX-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0
-; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4
-; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5
-; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6
-; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7
-; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]],
-; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
-; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0
+; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1
+; AVX-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2
+; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3
+; AVX-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]],
+; AVX-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>*
+; AVX-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_4(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -199,29 +199,33 @@
; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11
; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
-; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
-; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
-; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
-; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
-; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0
-; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1
-; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2
-; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3
-; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4
-; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5
-; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6
-; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
-; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]],
-; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
-; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 4
+; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
+; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
+; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
+; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]],
+; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; AVX-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
+; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
+; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
+; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
+; AVX-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> poison, i32 [[TMP21]], i64 0
+; AVX-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP22]], i64 1
+; AVX-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP23]], i64 2
+; AVX-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP24]], i64 3
+; AVX-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP28]],
+; AVX-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP30]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_3(
@@ -402,6 +406,7 @@
; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
+; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 4
; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
@@ -410,21 +415,24 @@
; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T3]], i64 0
+; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T7]], i64 1
+; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T11]], i64 2
+; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T15]], i64 3
+; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]],
+; AVX-NEXT: [[TMP6:%.*]] = bitcast i32* [[T0]] to <4 x i32>*
+; AVX-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0
-; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4
-; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5
-; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6
-; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7
-; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]],
-; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
-; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T19]], i64 0
+; AVX-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T23]], i64 1
+; AVX-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T27]], i64 2
+; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[T31]], i64 3
+; AVX-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]],
+; AVX-NEXT: [[TMP12:%.*]] = bitcast i32* [[T17]] to <4 x i32>*
+; AVX-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !tbaa [[TBAA0]]
; AVX-NEXT: ret void
;
; AVX2-LABEL: @gather_load_4(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -227,15 +227,37 @@
; logic...or a wide reduction?
define i1 @logical_and_icmp_clamp(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; SSE-LABEL: @logical_and_icmp_clamp(
+; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
+; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP1]]
+; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP4]], i1 [[TMP6]], i1 false
+; SSE-NEXT: ret i1 [[OP_RDX]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp(
+; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
+; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
+; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42
+; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
+; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
+; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; AVX-NEXT: ret i1 [[S7]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -260,17 +282,40 @@
}
define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT: call void @use1(i1 [[TMP2]])
-; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]])
-; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 [[TMP7]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; SSE-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
+; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; SSE-NEXT: call void @use1(i1 [[TMP2]])
+; SSE-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; SSE-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]])
+; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP1]]
+; SSE-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
+; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 [[TMP7]], i1 false
+; SSE-NEXT: ret i1 [[OP_RDX]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_extra_use_cmp(
+; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
+; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
+; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; AVX-NEXT: call void @use1(i1 [[C2]])
+; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42
+; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
+; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
+; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; AVX-NEXT: ret i1 [[S7]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -296,21 +341,44 @@
}
define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_select(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false
-; CHECK-NEXT: call void @use1(i1 [[S2]])
-; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 [[S2]], i1 false
-; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP7]], i1 [[OP_RDX]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX1]]
+; SSE-LABEL: @logical_and_icmp_clamp_extra_use_select(
+; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; SSE-NEXT: [[S1:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false
+; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[TMP5]], i1 false
+; SSE-NEXT: call void @use1(i1 [[S2]])
+; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]]
+; SSE-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
+; SSE-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 [[S2]], i1 false
+; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP7]], i1 [[OP_RDX]], i1 false
+; SSE-NEXT: ret i1 [[OP_RDX1]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_extra_use_select(
+; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
+; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
+; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; AVX-NEXT: [[C3:%.*]] = icmp slt i32 [[X3]], 42
+; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
+; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; AVX-NEXT: call void @use1(i1 [[S2]])
+; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
+; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; AVX-NEXT: ret i1 [[S7]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -386,38 +454,20 @@
}
define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
-; SSE-LABEL: @logical_and_icmp_clamp_partial(
-; SSE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
-; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1
-; SSE-NEXT: [[TMP6:%.*]] = icmp slt <2 x i32> [[TMP5]], <i32 42, i32 42>
-; SSE-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42
-; SSE-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
-; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
-; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
-; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
-; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false
-; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false
-; SSE-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 [[OP_RDX1]], i1 false
-; SSE-NEXT: ret i1 [[OP_RDX2]]
-;
-; AVX-LABEL: @logical_and_icmp_clamp_partial(
-; AVX-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
-; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0
-; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP3]], 42
-; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP2]], 42
-; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42
-; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; AVX-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
-; AVX-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false
-; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false
-; AVX-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP6]], i1 [[OP_RDX1]], i1 false
-; AVX-NEXT: ret i1 [[OP_RDX2]]
+; CHECK-LABEL: @logical_and_icmp_clamp_partial(
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 2
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[X]], i32 0
+; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[TMP3]], 42
+; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[TMP2]], 42
+; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[TMP1]], 42
+; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false
+; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false
+; CHECK-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP6]], i1 [[OP_RDX1]], i1 false
+; CHECK-NEXT: ret i1 [[OP_RDX2]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
@@ -442,17 +492,39 @@
}
define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; SSE-LABEL: @logical_and_icmp_clamp_pred_diff(
+; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; SSE-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
+; SSE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
+; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
+; SSE-NEXT: ret i1 [[OP_RDX]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_pred_diff(
+; AVX-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
+; AVX-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
+; AVX-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
+; AVX-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
+; AVX-NEXT: [[C0:%.*]] = icmp slt i32 [[X0]], 42
+; AVX-NEXT: [[C1:%.*]] = icmp slt i32 [[X1]], 42
+; AVX-NEXT: [[C2:%.*]] = icmp slt i32 [[X2]], 42
+; AVX-NEXT: [[C3:%.*]] = icmp ult i32 [[X3]], 42
+; AVX-NEXT: [[D0:%.*]] = icmp sgt i32 [[X0]], 17
+; AVX-NEXT: [[D1:%.*]] = icmp sgt i32 [[X1]], 17
+; AVX-NEXT: [[D2:%.*]] = icmp sgt i32 [[X2]], 17
+; AVX-NEXT: [[D3:%.*]] = icmp sgt i32 [[X3]], 17
+; AVX-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
+; AVX-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
+; AVX-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
+; AVX-NEXT: [[S4:%.*]] = select i1 [[S3]], i1 [[D0]], i1 false
+; AVX-NEXT: [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; AVX-NEXT: [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; AVX-NEXT: [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; AVX-NEXT: ret i1 [[S7]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
@@ -96,17 +96,16 @@
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
-; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
+; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[TMP10]], [[TMP9]]
; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
; CHECK: lor.lhs.false:
-; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]],
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
-; CHECK-NEXT: [[NOT_OR_COND9:%.*]] = or i1 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP11:%.*]] = fcmp ule <2 x double> [[TMP7]],
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1
+; CHECK-NEXT: [[NOT_OR_COND9:%.*]] = or i1 [[TMP12]], [[TMP13]]
; CHECK-NEXT: ret i1 [[NOT_OR_COND9]]
; CHECK: cleanup:
; CHECK-NEXT: ret i1 false
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -189,13 +189,14 @@
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>*
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
-; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -222,21 +223,22 @@
define void @addsub1(i32* noalias %dst, i32* noalias %src) {
; CHECK-LABEL: @addsub1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 2
-; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
-; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT: store i32 [[TMP6]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP7]], -3
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
; CHECK-NEXT: ret void
;