Index: llvm/lib/Analysis/ConstantFolding.cpp
===================================================================
--- llvm/lib/Analysis/ConstantFolding.cpp
+++ llvm/lib/Analysis/ConstantFolding.cpp
@@ -1428,6 +1428,15 @@
   case Intrinsic::smul_fix_sat:
   case Intrinsic::bitreverse:
   case Intrinsic::is_constant:
+  case Intrinsic::experimental_vector_reduce_add:
+  case Intrinsic::experimental_vector_reduce_mul:
+  case Intrinsic::experimental_vector_reduce_and:
+  case Intrinsic::experimental_vector_reduce_or:
+  case Intrinsic::experimental_vector_reduce_xor:
+  case Intrinsic::experimental_vector_reduce_smin:
+  case Intrinsic::experimental_vector_reduce_smax:
+  case Intrinsic::experimental_vector_reduce_umin:
+  case Intrinsic::experimental_vector_reduce_umax:
     return true;
 
   // Floating point operations cannot be folded in strictfp functions in
@@ -1644,6 +1653,52 @@
   return GetConstantFoldFPValue(V, Ty);
 }
 
+Constant *ConstantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) {
+  if (!isa<FixedVectorType>(Op->getType()))
+    return nullptr;
+  FixedVectorType *VT = cast<FixedVectorType>(Op->getType());
+  if (!isa<ConstantInt>(Op->getAggregateElement(0U)))
+    return nullptr;
+  APInt Acc = cast<ConstantInt>(Op->getAggregateElement(0U))->getValue();
+
+  for (unsigned I = 1; I < VT->getNumElements(); I++) {
+    if (!isa<ConstantInt>(Op->getAggregateElement(I)))
+      return nullptr;
+    const APInt &X = cast<ConstantInt>(Op->getAggregateElement(I))->getValue();
+    switch (IID) {
+    case Intrinsic::experimental_vector_reduce_add:
+      Acc = Acc + X;
+      break;
+    case Intrinsic::experimental_vector_reduce_mul:
+      Acc = Acc * X;
+      break;
+    case Intrinsic::experimental_vector_reduce_and:
+      Acc = Acc & X;
+      break;
+    case Intrinsic::experimental_vector_reduce_or:
+      Acc = Acc | X;
+      break;
+    case Intrinsic::experimental_vector_reduce_xor:
+      Acc = Acc ^ X;
+      break;
+    case Intrinsic::experimental_vector_reduce_smin:
+      Acc = APIntOps::smin(Acc, X);
+      break;
+    case Intrinsic::experimental_vector_reduce_smax:
+      Acc = APIntOps::smax(Acc, X);
+      break;
+    case Intrinsic::experimental_vector_reduce_umin:
+      Acc = APIntOps::umin(Acc, X);
+      break;
+    case Intrinsic::experimental_vector_reduce_umax:
+      Acc = APIntOps::umax(Acc, X);
+      break;
+    }
+  }
+
+  return ConstantInt::get(Op->getContext(), Acc);
+}
+
 /// Attempt to fold an SSE floating point to integer conversion of a constant
 /// floating point. If roundTowardZero is false, the default IEEE rounding is
 /// used (toward nearest, ties to even). This matches the behavior of the
@@ -2078,12 +2133,40 @@
     }
   }
 
+  if (isa<UndefValue>(Operands[0])) {
+    switch (IntrinsicID) {
+    default: break;
+    case Intrinsic::experimental_vector_reduce_add:
+    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::experimental_vector_reduce_xor:
+    case Intrinsic::experimental_vector_reduce_smin:
+    case Intrinsic::experimental_vector_reduce_smax:
+    case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::experimental_vector_reduce_umax:
+      return ConstantInt::get(Ty, 0);
+    }
+  }
+
+  // Support ConstantVector in case we have an Undef in the top.
   if (isa<ConstantVector>(Operands[0]) ||
       isa<ConstantDataVector>(Operands[0])) {
     auto *Op = cast<Constant>(Operands[0]);
     switch (IntrinsicID) {
     default: break;
+    case Intrinsic::experimental_vector_reduce_add:
+    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::experimental_vector_reduce_xor:
+    case Intrinsic::experimental_vector_reduce_smin:
+    case Intrinsic::experimental_vector_reduce_smax:
+    case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::experimental_vector_reduce_umax:
+      if (Constant *C = ConstantFoldVectorReduce(IntrinsicID, Op))
+        return C;
+      break;
     case Intrinsic::x86_sse_cvtss2si:
     case Intrinsic::x86_sse_cvtss2si64:
     case Intrinsic::x86_sse2_cvtsd2si:
Index: llvm/test/Analysis/ConstantFolding/vecreduce.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/ConstantFolding/vecreduce.ll
@@ -0,0 +1,399 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -constprop -S | FileCheck %s
+
+declare i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.mul.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.or.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.xor.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.smin.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.smax.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.umin.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> %a)
+declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a)
+
+
+define i32 @add_0() {
+; CHECK-LABEL: @add_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @add_1() {
+; CHECK-LABEL: @add_1(
+; CHECK-NEXT:    ret i32 8
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @add_inc() {
+; CHECK-LABEL: @add_inc(
+; CHECK-NEXT:    ret i32 18
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+define i32 @add_1v() {
+; CHECK-LABEL: @add_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @add_undef() {
+; CHECK-LABEL: @add_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> )
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+
+define i32 @mul_0() {
+; CHECK-LABEL: @mul_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @mul_1() {
+; CHECK-LABEL: @mul_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @mul_inc() {
+; CHECK-LABEL: @mul_inc(
+; CHECK-NEXT:    ret i32 40320
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+define i32 @mul_1v() {
+; CHECK-LABEL: @mul_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @mul_undef() {
+; CHECK-LABEL: @mul_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> )
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+
+define i32 @and_0() {
+; CHECK-LABEL: @and_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @and_1() {
+; CHECK-LABEL: @and_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @and_inc() {
+; CHECK-LABEL: @and_inc(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+define i32 @and_1v() {
+; CHECK-LABEL: @and_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @and_undef() {
+; CHECK-LABEL: @and_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> )
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+
+define i32 @or_0() {
+; CHECK-LABEL: @or_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @or_1() {
+; CHECK-LABEL: @or_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @or_inc() {
+; CHECK-LABEL: @or_inc(
+; CHECK-NEXT:    ret i32 -1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+define i32 @or_1v() {
+; CHECK-LABEL: @or_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @or_undef() {
+; CHECK-LABEL: @or_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> )
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+
+define i32 @xor_0() {
+; CHECK-LABEL: @xor_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @xor_1() {
+; CHECK-LABEL: @xor_1(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @xor_inc() {
+; CHECK-LABEL: @xor_inc(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+define i32 @xor_1v() {
+; CHECK-LABEL: @xor_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @xor_undef() {
+; CHECK-LABEL: @xor_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> )
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+
+define i32 @smin_0() {
+; CHECK-LABEL: @smin_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @smin_1() {
+; CHECK-LABEL: @smin_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @smin_inc() {
+; CHECK-LABEL: @smin_inc(
+; CHECK-NEXT:    ret i32 -6
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+define i32 @smin_1v() {
+; CHECK-LABEL: @smin_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @smin_undef() {
+; CHECK-LABEL: @smin_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> )
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+
+define i32 @smax_0() {
+; CHECK-LABEL: @smax_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @smax_1() {
+; CHECK-LABEL: @smax_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @smax_inc() {
+; CHECK-LABEL: @smax_inc(
+; CHECK-NEXT:    ret i32 8
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+define i32 @smax_1v() {
+; CHECK-LABEL: @smax_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @smax_undef() {
+; CHECK-LABEL: @smax_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> )
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+
+define i32 @umin_0() {
+; CHECK-LABEL: @umin_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @umin_1() {
+; CHECK-LABEL: @umin_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @umin_inc() {
+; CHECK-LABEL: @umin_inc(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+define i32 @umin_1v() {
+; CHECK-LABEL: @umin_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @umin_undef() {
+; CHECK-LABEL: @umin_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> )
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+
+define i32 @umax_0() {
+; CHECK-LABEL: @umax_0(
+; CHECK-NEXT:    ret i32 0
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> zeroinitializer)
+  ret i32 %x
+}
+
+define i32 @umax_1() {
+; CHECK-LABEL: @umax_1(
+; CHECK-NEXT:    ret i32 1
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+  ret i32 %x
+}
+
+define i32 @umax_inc() {
+; CHECK-LABEL: @umax_inc(
+; CHECK-NEXT:    ret i32 -3
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> )
+  ret i32 %x
+}
+
+define i32 @umax_1v() {
+; CHECK-LABEL: @umax_1v(
+; CHECK-NEXT:    ret i32 10
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v1i32(<1 x i32> <i32 10>)
+  ret i32 %x
+}
+
+define i32 @umax_undef() {
+; CHECK-LABEL: @umax_undef(
+; CHECK-NEXT:    [[X:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> )
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %x = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> )
+  ret i32 %x
+}
Index: llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -117,17 +117,16 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> )
-; CHECK-NEXT:    [[TMP5]] = add i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4]] = add i32 [[TMP3]], 12
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -135,7 +134,7 @@
 ; CHECK:       .lr.ph:
 ; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7
 ; CHECK:       ._crit_edge:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -1072,19 +1071,18 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 255, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> )
-; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP0]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[INDEX]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    [[TMP7]] = and i32 [[TMP6]], [[TMP1]]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 255, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = and i32 [[VEC_PHI]], 255
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP6]] = and i32 [[TMP5]], [[TMP0]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !40
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !40
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1092,7 +1090,7 @@
 ; CHECK:       .lr.ph:
 ; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !41
 ; CHECK:       ._crit_edge:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[SUM_0_LCSSA]] to i8
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;