Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -706,12 +706,35 @@
   if (SimplifyDemandedInstructionBits(CI))
     return &CI;

-  // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
   if (DestTy->getScalarSizeInBits() == 1) {
-    Constant *One = ConstantInt::get(SrcTy, 1);
-    Src = Builder.CreateAnd(Src, One);
     Value *Zero = Constant::getNullValue(Src->getType());
-    return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
+    if (DestTy->isIntegerTy()) {
+      // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
+      // TODO: We canonicalize to more instructions here because we are probably
+      // lacking equivalent analysis for trunc relative to icmp. There may also
+      // be codegen concerns. If those trunc limitations were removed, we could
+      // remove this transform.
+      Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
+
+    // For vectors, we do not canonicalize all truncs to icmp, so optimize
+    // patterns that would be covered within visitICmpInst.
+    Value *X;
+    const APInt *C;
+    if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
+      // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
+      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C);
+      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
+    if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)),
+                                   m_Deferred(X))))) {
+      // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
+      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1;
+      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+    }
   }

   // FIXME: Maybe combine the next two transforms to handle the no cast case
Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCompares.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1609,6 +1609,13 @@
 Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
                                                  BinaryOperator *And,
                                                  const APInt &C1) {
+  // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
+  // TODO: We canonicalize to the longer form for scalars because we have
+  // better analysis/folds for icmp, and codegen may be better with icmp.
+  if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() &&
+      C1.isNullValue() && match(And->getOperand(1), m_One()))
+    return new TruncInst(And->getOperand(0), Cmp.getType());
+
   const APInt *C2;
   if (!match(And->getOperand(1), m_APInt(C2)))
     return nullptr;
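Note (not part of the patch): a hand-written IR sketch of the two complementary canonicalizations above. The function names are invented, and the rewritten forms shown in comments simply follow the new code paths in the trunc visitor and in foldICmpAndConstConst.

define <2 x i1> @sketch_vec_and1_to_trunc(<2 x i8> %x) {
  ; For vectors, icmp ne (and X, 1), 0 is now shrunk to a single trunc:
  ;   --> trunc <2 x i8> %x to <2 x i1>
  %a = and <2 x i8> %x, <i8 1, i8 1>
  %r = icmp ne <2 x i8> %a, zeroinitializer
  ret <2 x i1> %r
}

define <2 x i1> @sketch_vec_trunc_of_lshr(<2 x i8> %x) {
  ; A one-use lshr by a constant feeding a trunc-to-i1 becomes the masked
  ; compare that visitICmpInst already understands, with C' = 1 << 3 = 8:
  ;   --> icmp ne (and <2 x i8> %x, <i8 8, i8 8>), zeroinitializer
  %s = lshr <2 x i8> %x, <i8 3, i8 3>
  %r = trunc <2 x i8> %s to <2 x i1>
  ret <2 x i1> %r
}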
Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1477,6 +1477,33 @@
   return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
 }

+/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
+static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
+  Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
+  if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1))
+    return nullptr;
+
+  Value *X, *Y;
+  Constant *Mask;
+  if (!match(Op0, m_ShuffleVector(m_Value(X), m_Value(Y), m_Constant(Mask))))
+    return nullptr;
+
+  // We are extracting a subvector from a shuffle. Remove excess elements from
+  // the 1st shuffle mask to eliminate the extract.
+  // shuf (shuf X, Y, <C0, C1, C2, C3>), undef, <0, undef, 2> -->
+  // shuf X, Y, <C0, undef, C2>
+  unsigned NumElts = Shuf.getType()->getVectorNumElements();
+  SmallVector<Constant *, 16> NewMask(NumElts);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    // If the extracting shuffle has an undef mask element, it transfers to the
+    // new shuffle mask. Otherwise, copy the original mask element.
+    Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i);
+    Constant *MaskElt = Mask->getAggregateElement(i);
+    NewMask[i] = isa<UndefValue>(ExtractMaskElt) ? ExtractMaskElt : MaskElt;
+  }
+  return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask));
+}
+
 Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Value *LHS = SVI.getOperand(0);
   Value *RHS = SVI.getOperand(1);
@@ -1499,6 +1526,9 @@
     return &SVI;
   }

+  if (Instruction *I = foldIdentityExtractShuffle(SVI))
+    return I;
+
   SmallVector<int, 16> Mask = SVI.getShuffleMask();
   Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
   unsigned LHSWidth = LHS->getType()->getVectorNumElements();
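Note (not part of the patch): a hand-written example of the new foldIdentityExtractShuffle() transform. The values and masks are invented; the folded form in the comment follows the mask-trimming logic of the function above.

define <2 x i8> @sketch_extract_of_shuffle(<2 x i8> %x, <2 x i8> %y) {
  ; The outer shuffle is an identity-with-extract of the inner shuffle's low
  ; half, so the extract is removed by keeping only the first two elements of
  ; the inner mask:
  ;   --> shufflevector <2 x i8> %x, <2 x i8> %y, <2 x i32> <i32 0, i32 2>
  %wide = shufflevector <2 x i8> %x, <2 x i8> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  %narrow = shufflevector <4 x i8> %wide, <4 x i8> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i8> %narrow
}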
Index: llvm/trunk/test/Transforms/InstCombine/apint-shift.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/apint-shift.ll
+++ llvm/trunk/test/Transforms/InstCombine/apint-shift.ll
@@ -319,8 +319,8 @@
 define <2 x i1> @test16vec(<2 x i84> %X) {
 ; CHECK-LABEL: @test16vec(
-; CHECK-NEXT:    [[AND:%.*]] = and <2 x i84> %X,
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i84> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i84> [[X:%.*]],
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i84> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %shr = ashr <2 x i84> %X,
Index: llvm/trunk/test/Transforms/InstCombine/apint-shl-trunc.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/apint-shl-trunc.ll
+++ llvm/trunk/test/Transforms/InstCombine/apint-shl-trunc.ll
@@ -27,9 +27,8 @@
 define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) {
 ; CHECK-LABEL: @test0vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i39> , [[A:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i39> [[TMP1]], [[X:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[D]]
 ;
   %B = lshr <2 x i39> %X, %A
Index: llvm/trunk/test/Transforms/InstCombine/icmp.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/icmp.ll
+++ llvm/trunk/test/Transforms/InstCombine/icmp.ll
@@ -2427,10 +2427,9 @@
 define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @icmp_and_or_lshr_vec(
-; CHECK-NEXT:    [[SHF1:%.*]] = shl nuw <2 x i32> , [[Y:%.*]]
-; CHECK-NEXT:    [[OR2:%.*]] = or <2 x i32> [[SHF1]],
-; CHECK-NEXT:    [[AND3:%.*]] = and <2 x i32> [[OR2]], [[X:%.*]]
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND3]], zeroinitializer
+; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[SHF]], [[X]]
+; CHECK-NEXT:    [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %shf = lshr <2 x i32> %x, %y
@@ -2445,8 +2444,7 @@
 ; CHECK-NEXT:    [[X:%.*]] = srem <2 x i32> [[XP:%.*]],
 ; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X]], [[Y:%.*]]
 ; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
-; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[OR]],
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1>
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %x = srem <2 x i32> %xp, ; prevent complexity-based canonicalization
@@ -2472,8 +2470,8 @@
 define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec(
-; CHECK-NEXT:    [[AND1:%.*]] = and <2 x i32> [[X:%.*]],
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND1]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X:%.*]],
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %shf = lshr <2 x i32> %x,
@@ -2486,10 +2484,8 @@
 define <2 x i1> @icmp_and_or_lshr_cst_vec_commute(<2 x i32> %xp) {
 ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_commute(
 ; CHECK-NEXT:    [[X:%.*]] = srem <2 x i32> [[XP:%.*]],
-; CHECK-NEXT:    [[SHF:%.*]] = lshr <2 x i32> [[X]],
-; CHECK-NEXT:    [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]]
-; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[OR]],
-; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i32> [[X]],
+; CHECK-NEXT:    [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[RET]]
 ;
   %x = srem <2 x i32> %xp, ; prevent complexity-based canonicalization
Index: llvm/trunk/test/Transforms/InstCombine/vec_shuffle.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/vec_shuffle.ll
+++ llvm/trunk/test/Transforms/InstCombine/vec_shuffle.ll
@@ -174,8 +174,7 @@
 define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @extract_subvector_of_shuffle(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <3 x i32>
-; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <3 x i8> [[SHUF]], <3 x i8> undef, <2 x i32>
+; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32>
 ; CHECK-NEXT:    ret <2 x i8> [[EXTRACT_SUBV]]
 ;
   %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <3 x i32>
@@ -194,7 +193,7 @@
 ; CHECK-LABEL: @extract_subvector_of_shuffle_extra_use(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <5 x i32>
 ; CHECK-NEXT:    call void @use_v5i8(<5 x i8> [[SHUF]])
-; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <5 x i8> [[SHUF]], <5 x i8> undef, <4 x i32>
+; CHECK-NEXT:    [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X]], <2 x i8> [[Y]], <4 x i32>
 ; CHECK-NEXT:    ret <4 x i8> [[EXTRACT_SUBV]]
 ;
   %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <5 x i32>
Index: llvm/trunk/test/Transforms/InstCombine/vector-casts.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/vector-casts.ll
+++ llvm/trunk/test/Transforms/InstCombine/vector-casts.ll
@@ -1,26 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s

-; This turns into a&1 != 0
-; TODO: The bar for canonicalizing to something bigger than the minimal length IR is very high.
-; This pattern does not appear to meet that standard. +; Can't get smaller than this. define <2 x i1> @trunc(<2 x i64> %a) { ; CHECK-LABEL: @trunc( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], -; CHECK-NEXT: [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[T]] ; %t = trunc <2 x i64> %a to <2 x i1> ret <2 x i1> %t } -; TODO: This could be just 1 instruction (trunc). +; This is trunc. define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc( -; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], -; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer +; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t = and <2 x i64> %a, @@ -28,12 +24,11 @@ ret <2 x i1> %r } -; TODO: This could be just 1 instruction (trunc). +; This is trunc. define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt( -; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], -; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer +; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t = and <2 x i64> %a, @@ -41,7 +36,7 @@ ret <2 x i1> %r } -; TODO: This could be just 1 instruction (trunc). +; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete. define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts( Index: llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -2901,49 +2901,45 @@ ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 ; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], -; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], -; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], -; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer -; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer -; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1> +; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1> +; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1> +; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1> +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef) +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 4 +; AVX-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>* +; AVX-NEXT: 
[[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef) +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8 ; AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x double*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 4 +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12 ; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x double*> undef) -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8 -; AVX-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x double*> undef) -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 12 -; AVX-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double*> undef) -; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] -; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] -; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] -; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] -; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4 +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef) +; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]] +; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]] +; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]] +; 
AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]] +; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]]) +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4 +; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]]) +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12 ; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) -; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 -; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) -; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12 -; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 +; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -2953,14 +2949,14 @@ ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 -; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 +; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 ; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX: land.lhs.true: ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null +; AVX-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null ; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX: if.then: ; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, 
double* [[OUT]], i64 [[INDVARS_IV]] @@ -2998,49 +2994,45 @@ ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], -; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], -; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer -; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer -; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer -; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1> +; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1> +; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1> +; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1> +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef) +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef) +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 16 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x double*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x double*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 16 -; AVX512-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x double*> undef) -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 24 -; AVX512-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double*> undef) -; AVX512-NEXT: 
[[TMP24:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] -; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] -; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] -; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]] +; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]] +; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]] +; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]] +; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16 -; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24 -; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> 
[[TMP32]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63 +; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63 ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3050,14 +3042,14 @@ ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 +; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null +; AVX512-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -3162,49 +3154,45 @@ ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 ; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], -; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], -; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], -; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer -; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer -; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1> +; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1> +; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1> +; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1> +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP14:%.*]] = getelementptr 
inbounds i32 ()*, i32 ()** [[TMP12]], i64 4 +; AVX-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8 ; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 4 +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12 ; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8 -; AVX-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 12 -; AVX-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] -; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] -; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] -; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] -; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4 +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: 
[[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]] +; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]] +; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]] +; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]] +; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]]) +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4 +; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]]) +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12 ; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) -; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 -; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) -; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12 -; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 +; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3214,14 +3202,14 @@ ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 -; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 +; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 ; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX: land.lhs.true: ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null +; AVX-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq 
i32 ()* [[TMP39]], null ; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX: if.then: ; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -3259,49 +3247,45 @@ ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], -; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], -; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer -; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer -; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer -; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1> +; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1> +; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1> +; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1> +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 16 -; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 24 -; AVX512-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] 
to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] -; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] -; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] -; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]] +; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]] +; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]] +; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]] +; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16 -; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds 
double, double* [[TMP28]], i64 24 -; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66 +; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66 ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3311,14 +3295,14 @@ ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 +; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null +; AVX512-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
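Note (not part of the patch): the masked_load_store.ll churn above is fallout from the same vector icmp-of-and-1 to trunc fold; each unrolled part of the vectorized loop loses one instruction in its block-mask computation, and the remaining values are renumbered. A minimal sketch of that mask change, with invented names:

define <4 x i1> @sketch_loop_mask(<4 x i8> %wide.load) {
  ; before: and + icmp ne; with this patch, instcombine produces just:
  ;   %mask = trunc <4 x i8> %wide.load to <4 x i1>
  %m = and <4 x i8> %wide.load, <i8 1, i8 1, i8 1, i8 1>
  %mask = icmp ne <4 x i8> %m, zeroinitializer
  ret <4 x i1> %mask
}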