diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2062,15 +2062,15 @@ if (OrigLoop->isLoopInvariant(V)) return V; - assert(Instance.Lane > 0 - ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) - : true && "Uniform values only have lane zero"); - + // Always use lane 0 for uniform values. + unsigned Lane = Instance.Lane; + if (Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) + Lane = 0; // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested // scalar value. - if (VectorLoopValueMap.hasScalarValue(V, Instance)) - return VectorLoopValueMap.getScalarValue(V, Instance); + if (VectorLoopValueMap.hasScalarValue(V, {Instance.Part, Lane})) + return VectorLoopValueMap.getScalarValue(V, {Instance.Part, Lane}); // If the value has not been scalarized, get its entry in VectorLoopValueMap // for the given unroll part. If this entry is not a vector type (i.e., the @@ -4671,11 +4671,11 @@ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); } - // Holds consecutive and consecutive-like pointers. Consecutive-like pointers - // are pointers that are treated like consecutive pointers during - // vectorization. The pointer operands of interleaved accesses are an - // example. - SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; + // Holds consecutive and consecutive-like pointers, as well as trivially loop + // invariant instructions. Consecutive-like pointers are pointers that are + // treated like consecutive pointers during vectorization. The pointer + // operands of interleaved accesses are an example. + SmallSetVector<Instruction *, 8> PotentialUniformRoots; // Holds pointer operands of instructions that are possibly non-uniform. SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; @@ -4699,6 +4699,16 @@ // the getelementptr won't remain uniform. 
for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { + // Instructions with loop invariant operands are uniform, as long as + // they do not read or write memory, are PHI nodes or terminators. + if (&I != BB->getTerminator() && !I.mayReadOrWriteMemory() && + !isa<PHINode>(&I) && all_of(I.operands(), [this](Use &U) { + return this->TheLoop->isLoopInvariant(U); + })) { + PotentialUniformRoots.insert(&I); + continue; + } + // If there's no pointer operand, there's nothing to do. auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); if (!Ptr) @@ -4722,12 +4732,12 @@ // is consecutive-like, or interleaving - the pointer operand should // remain uniform. else - ConsecutiveLikePtrs.insert(Ptr); + PotentialUniformRoots.insert(Ptr); } // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. - for (auto *V : ConsecutiveLikePtrs) + for (auto *V : PotentialUniformRoots) if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); Worklist.insert(V); @@ -4754,7 +4764,7 @@ auto *OI = cast<Instruction>(OV); if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast<Instruction>(U); - return Worklist.count(J) || + return Worklist.count(J) || TheLoop->isLoopInvariant(U) || (OI == getLoadStorePointerOperand(J) && isUniformDecision(J, VF)); })) { @@ -5481,8 +5491,6 @@ int LoopVectorizationCostModel::computePredInstDiscount( Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, unsigned VF) { - assert(!isUniformAfterVectorization(PredInst, VF) && - "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the // vector version cost the same. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll @@ -9,8 +9,8 @@ ; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2. ; CM: LV: Scalar loop costs: 7. -; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0 -; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1 +; CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0 +; CM-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1 ; Check that the extractvalue operands are actually free in vector code. @@ -21,21 +21,19 @@ ; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, ; FORCED-NEXT: %0 = add i32 %index, 0 ; FORCED-NEXT: %1 = extractvalue { i64, i64 } %sv, 0 -; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 0 -; FORCED-NEXT: %3 = insertelement <2 x i64> undef, i64 %1, i32 0 -; FORCED-NEXT: %4 = insertelement <2 x i64> %3, i64 %2, i32 1 -; FORCED-NEXT: %5 = extractvalue { i64, i64 } %sv, 1 -; FORCED-NEXT: %6 = extractvalue { i64, i64 } %sv, 1 -; FORCED-NEXT: %7 = insertelement <2 x i64> undef, i64 %5, i32 0 -; FORCED-NEXT: %8 = insertelement <2 x i64> %7, i64 %6, i32 1 -; FORCED-NEXT: %9 = getelementptr i64, i64* %dst, i32 %0 -; FORCED-NEXT: %10 = add <2 x i64> %4, %8 -; FORCED-NEXT: %11 = getelementptr i64, i64* %9, i32 0 -; FORCED-NEXT: %12 = bitcast i64* %11 to <2 x i64>* -; FORCED-NEXT: store <2 x i64> %10, <2 x i64>* %12, align 4 +; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x i64> undef, i64 %1, i32 0 +; FORCED-NEXT: %broadcast.splat2 = shufflevector <2 x 
i64> %broadcast.splatinsert1, <2 x i64> undef, <2 x i32> zeroinitializer +; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 1 +; FORCED-NEXT: %broadcast.splatinsert3 = insertelement <2 x i64> undef, i64 %2, i32 0 +; FORCED-NEXT: %broadcast.splat4 = shufflevector <2 x i64> %broadcast.splatinsert3, <2 x i64> undef, <2 x i32> zeroinitializer +; FORCED-NEXT: %3 = getelementptr i64, i64* %dst, i32 %0 +; FORCED-NEXT: %4 = add <2 x i64> %broadcast.splat2, %broadcast.splat4 +; FORCED-NEXT: %5 = getelementptr i64, i64* %3, i32 0 +; FORCED-NEXT: %6 = bitcast i64* %5 to <2 x i64>* +; FORCED-NEXT: store <2 x i64> %4, <2 x i64>* %6, align 4 ; FORCED-NEXT: %index.next = add i32 %index, 2 -; FORCED-NEXT: %13 = icmp eq i32 %index.next, 0 -; FORCED-NEXT: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !0 +; FORCED-NEXT: %7 = icmp eq i32 %index.next, 0 +; FORCED-NEXT: br i1 %7, label %middle.block, label %vector.body, !llvm.loop !0 define void @test1(i64* %dst, {i64, i64} %sv) { entry: @@ -61,8 +59,8 @@ declare float @pow(float, float) readnone nounwind ; CM: LV: Scalar loop costs: 16. 
-; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0 -; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1 +; CM: LV: Found an estimated cost of 1 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0 +; CM-NEXT: LV: Found an estimated cost of 1 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1 ; FORCED-LABEL: define void @test_getVectorCallCost @@ -73,21 +71,19 @@ ; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, ; FORCED-NEXT: %0 = add i32 %index, 0 ; FORCED-NEXT: %1 = extractvalue { float, float } %sv, 0 -; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 0 -; FORCED-NEXT: %3 = insertelement <2 x float> undef, float %1, i32 0 -; FORCED-NEXT: %4 = insertelement <2 x float> %3, float %2, i32 1 -; FORCED-NEXT: %5 = extractvalue { float, float } %sv, 1 -; FORCED-NEXT: %6 = extractvalue { float, float } %sv, 1 -; FORCED-NEXT: %7 = insertelement <2 x float> undef, float %5, i32 0 -; FORCED-NEXT: %8 = insertelement <2 x float> %7, float %6, i32 1 -; FORCED-NEXT: %9 = getelementptr float, float* %dst, i32 %0 -; FORCED-NEXT: %10 = call <2 x float> @llvm.pow.v2f32(<2 x float> %4, <2 x float> %8) -; FORCED-NEXT: %11 = getelementptr float, float* %9, i32 0 -; FORCED-NEXT: %12 = bitcast float* %11 to <2 x float>* -; FORCED-NEXT: store <2 x float> %10, <2 x float>* %12, align 4 +; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x float> undef, float %1, i32 0 +; FORCED-NEXT: %broadcast.splat2 = shufflevector <2 x float> %broadcast.splatinsert1, <2 x float> undef, <2 x i32> zeroinitializer +; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 1 +; FORCED-NEXT: %broadcast.splatinsert3 = insertelement <2 x float> undef, float %2, i32 0 +; FORCED-NEXT: %broadcast.splat4 = shufflevector <2 x float> %broadcast.splatinsert3, <2 x float> undef, <2 x i32> zeroinitializer +; FORCED-NEXT: %3 = getelementptr float, 
float* %dst, i32 %0 +; FORCED-NEXT: %4 = call <2 x float> @llvm.pow.v2f32(<2 x float> %broadcast.splat2, <2 x float> %broadcast.splat4) +; FORCED-NEXT: %5 = getelementptr float, float* %3, i32 0 +; FORCED-NEXT: %6 = bitcast float* %5 to <2 x float>* +; FORCED-NEXT: store <2 x float> %4, <2 x float>* %6, align 4 ; FORCED-NEXT: %index.next = add i32 %index, 2 -; FORCED-NEXT: %13 = icmp eq i32 %index.next, 0 -; FORCED-NEXT: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4 +; FORCED-NEXT: %7 = icmp eq i32 %index.next, 0 +; FORCED-NEXT: br i1 %7, label %middle.block, label %vector.body, !llvm.loop !4 define void @test_getVectorCallCost(float* %dst, {float, float} %sv) { entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/assume.ll b/llvm/test/Transforms/LoopVectorize/X86/assume.ll --- a/llvm/test/Transforms/LoopVectorize/X86/assume.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/assume.ll @@ -66,18 +66,6 @@ ; CHECK: @llvm.assume ; CHECK: @llvm.assume ; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume -; CHECK: @llvm.assume ; CHECK: for.body: ; CHECK: ret void diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -23,15 +23,39 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]] -; 
CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>* -; CHECK-NEXT: store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 0 to i64 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, <2 x i64> [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x %rec8*> [[TMP2]] to <2 x i16*> +; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16*, i16** [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16** [[TMP6]] to <2 x i16*>* +; CHECK-NEXT: store <2 x i16*> [[TMP3]], <2 x i16*>* [[TMP7]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2 +; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[BB1:%.*]] ] +; CHECK-NEXT: br label 
[[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ] +; CHECK-NEXT: [[_TMP1:%.*]] = zext i16 0 to i64 +; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 [[_TMP1]] +; CHECK-NEXT: [[_TMP4:%.*]] = bitcast %rec8* [[_TMP2]] to i16* +; CHECK-NEXT: [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64 +; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[_TMP6]] +; CHECK-NEXT: store i16* [[_TMP4]], i16** [[_TMP7]] +; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1 +; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2 +; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop !2 +; CHECK: bb3: +; CHECK-NEXT: ret void +; bb1: br label %bb2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -18,16 +18,16 @@ ; CHECK: if.then: ; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> undef, i8 [[X:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* null, i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = 
insertelement <4 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[P:%.*]], align 1, !tbaa !1 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 ; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 @@ -43,19 +43,25 @@ ; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* undef, align 1, !tbaa !1 ; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* undef, align 1, !tbaa !1 ; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* undef, align 1, !tbaa !1 -; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i32 0 -; CHECK-NEXT: store i32 [[TMP21]], i32* undef, align 4, !tbaa !4 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i32 1 -; CHECK-NEXT: store i32 [[TMP22]], i32* undef, align 4, !tbaa !4 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 undef, 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[TMP19]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP14]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP21:%.*]] = zext i8 undef to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP21]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = or <4 x i32> [[TMP20]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP22]], i32 0 ; CHECK-NEXT: 
store i32 [[TMP23]], i32* undef, align 4, !tbaa !4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP22]], i32 1 ; CHECK-NEXT: store i32 [[TMP24]], i32* undef, align 4, !tbaa !4 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP22]], i32 2 +; CHECK-NEXT: store i32 [[TMP25]], i32* undef, align 4, !tbaa !4 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3 +; CHECK-NEXT: store i32 [[TMP26]], i32* undef, align 4, !tbaa !4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1, 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]] @@ -66,11 +72,11 @@ ; CHECK-NEXT: [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X]] to i32 ; CHECK-NEXT: [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24 -; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 -; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP26]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 +; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP28]] to i32 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]] -; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* undef, align 1, !tbaa !1 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* undef, align 1, !tbaa !1 ; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]] ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 undef to i32 diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll --- a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll @@ -2,6 +2,8 @@ target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" target triple = "i686-pc-windows-msvc18.0.0" +; Loop invariant call to @floor is uniform, which means we do not end up +; with any vector instructions in the loop. define void @test1() #0 personality i32 (...)* @__CxxFrameHandler3 { entry: invoke void @_CxxThrowException(i8* null, i8* null) @@ -33,7 +35,43 @@ ; CHECK-LABEL: define void @test1( ; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null] -; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ] +; CHECK: call double @floor(double 1.000000e+00) #1 [ "funclet"(token %1) ] + +define void @test2(double* %A) #0 personality i32 (...)* @__CxxFrameHandler3 { +entry: + invoke void @_CxxThrowException(i8* null, i8* null) + to label %unreachable unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %1 = catchpad within %0 [i8* null, i32 64, i8* null] + br label %for.body + +for.cond.cleanup: ; preds = %for.body + catchret from %1 to label %try.cont + +for.body: ; preds = %for.body, %catch + %i.07 = phi i32 [ 0, %catch ], [ %inc, %for.body ] + %A.ptr = getelementptr double, double* %A, i32 %i.07 + %A.val = load double, double* %A.ptr + %call = call double @floor(double %A.val) #1 [ "funclet"(token %1) ] + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +try.cont: ; preds = %for.cond.cleanup + ret void + +unreachable: ; preds = %entry + unreachable +} + +; CHECK-LABEL: define void @test2( +; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null] +; 
CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %wide.load) [ "funclet"(token %1) ] + declare x86_stdcallcc void @_CxxThrowException(i8*, i8*) diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -25,23 +25,25 @@ ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT5]], <16 x i32*> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT9]], <16 x i32*> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT6]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32* [[A]], null +; CHECK-NEXT: 
[[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i1> undef, i1 [[TMP3]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT7]], <16 x i1> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT6]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT6]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT10]], i32 4, <16 x i1> [[BROADCAST_SPLAT8]], <16 x i32> undef), !alias.scope !3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[BROADCAST_SPLAT8]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[PREDPHI]], i32 15 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -6,21 +6,50 @@ ; first test checks that loop with a reduction and a uniform store gets ; 
vectorized. -; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction -; CHECK-LABEL: vector.memcheck: -; CHECK: found.conflict - -; CHECK-LABEL: vector.body: -; CHECK: %vec.phi = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ] -; CHECK: %wide.load = load <16 x i32> -; CHECK: [[ADD]] = add <16 x i32> %vec.phi, %wide.load -; CHECK: store i32 %ntrunc, i32* %a -; CHECK-NOT: store i32 %ntrunc, i32* %a -; CHECK: %index.next = add i64 %index, 64 - -; CHECK-LABEL: middle.block: -; CHECK: %rdx.shuf = shufflevector <16 x i32> define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) { +; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP10:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP11:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP12:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP13:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 32 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* 
[[TMP7]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 48 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP10]] = add <16 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11]] = add <16 x i32> [[VEC_PHI8]], [[WIDE_LOAD11]] +; CHECK-NEXT: [[TMP12]] = add <16 x i32> [[VEC_PHI9]], [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP13]] = add <16 x i32> [[VEC_PHI10]], [[WIDE_LOAD13]] +; CHECK-NEXT: store i32 %ntrunc, i32* %a, align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], %n.vec +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label %vector.body, !llvm.loop !5 +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <16 x i32> [[TMP12]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX15:%.*]] = add <16 x i32> [[TMP13]], [[BIN_RDX14]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[BIN_RDX15]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX16:%.*]] = add <16 x i32> [[BIN_RDX15]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF17:%.*]] = shufflevector <16 x i32> [[BIN_RDX16]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX18:%.*]] = add <16 x i32> [[BIN_RDX16]], [[RDX_SHUF17]] +; CHECK-NEXT: [[RDX_SHUF19:%.*]] = shufflevector <16 x i32> [[BIN_RDX18]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX20:%.*]] = add <16 x i32> [[BIN_RDX18]], [[RDX_SHUF19]] +; CHECK-NEXT: [[RDX_SHUF21:%.*]] = shufflevector <16 x i32> [[BIN_RDX20]], <16 x i32> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX22:%.*]] = add <16 x i32> [[BIN_RDX20]], [[RDX_SHUF21]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i32> [[BIN_RDX22]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 %smax, %n.vec +; CHECK-NEXT: br i1 
[[CMP_N]], label [[FOR_END:%.*]], label %scalar.ph + entry: %ntrunc = trunc i64 %n to i32 br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -537,10 +537,10 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE36:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP181:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP182:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP183:%.*]], [[PRED_LOAD_CONTINUE36]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP168:%.*]], [[PRED_LOAD_CONTINUE36]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP169:%.*]], [[PRED_LOAD_CONTINUE36]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP170:%.*]], [[PRED_LOAD_CONTINUE36]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP171:%.*]], [[PRED_LOAD_CONTINUE36]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], @@ -611,211 +611,199 @@ ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 
[[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP64:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP65:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP67:%.*]] = bitcast i16* [[TMP66]] to i32* -; CHECK-NEXT: [[TMP68:%.*]] = load i32, i32* [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i32> undef, i32 [[TMP68]], i32 0 +; CHECK-NEXT: [[TMP66:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP67:%.*]] = bitcast i32* [[BASE]] to i16* +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i16, i16* [[TMP64]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP70:%.*]] = bitcast i16* [[TMP69]] to i32* +; CHECK-NEXT: [[TMP71:%.*]] = load i32, i32* [[TMP70]] +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> undef, i32 [[TMP71]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP70:%.*]] = phi <4 x i32> [ undef, [[VECTOR_BODY]] ], [ [[TMP69]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ undef, [[VECTOR_BODY]] ], [ [[TMP72]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 +; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF7:%.*]], label 
[[PRED_LOAD_CONTINUE8:%.*]] ; CHECK: pred.load.if7: -; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i16, i16* [[TMP72]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP74:%.*]] = bitcast i16* [[TMP73]] to i32* -; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* [[TMP74]] -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP75]], i32 1 +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i16, i16* [[TMP64]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP76:%.*]] = bitcast i16* [[TMP75]] to i32* +; CHECK-NEXT: [[TMP77:%.*]] = load i32, i32* [[TMP76]] +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP77]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP77:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP76]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP78]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; CHECK-NEXT: [[TMP79:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP78]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 +; CHECK-NEXT: br i1 [[TMP80]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK: pred.load.if9: -; CHECK-NEXT: [[TMP79:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i16, i16* [[TMP79]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP81:%.*]] = bitcast i16* [[TMP80]] to i32* -; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[TMP81]] -; CHECK-NEXT: [[TMP83:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds i16, i16* [[TMP64]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP82:%.*]] = bitcast i16* [[TMP81]] to i32* +; CHECK-NEXT: [[TMP83:%.*]] = load i32, i32* [[TMP82]] +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> [[TMP79]], i32 
[[TMP83]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP84:%.*]] = phi <4 x i32> [ [[TMP77]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP83]], [[PRED_LOAD_IF9]] ] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP85]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; CHECK-NEXT: [[TMP85:%.*]] = phi <4 x i32> [ [[TMP79]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP84]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 +; CHECK-NEXT: br i1 [[TMP86]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP86:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP86]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP64]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP88:%.*]] = bitcast i16* [[TMP87]] to i32* ; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[TMP88]] -; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP89]], i32 3 +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP89]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ [[TMP84]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP90]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ [[TMP85]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP90]], [[PRED_LOAD_IF11]] ] ; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 ; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP93:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds i16, i16* [[TMP93]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP95:%.*]] = bitcast i16* [[TMP94]] to i32* -; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]] 
-; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> undef, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP94:%.*]] = bitcast i16* [[TMP93]] to i32* +; CHECK-NEXT: [[TMP95:%.*]] = load i32, i32* [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> undef, i32 [[TMP95]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP97]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; CHECK-NEXT: [[TMP97:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP96]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 +; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK: pred.load.if15: -; CHECK-NEXT: [[TMP100:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds i16, i16* [[TMP100]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP102:%.*]] = bitcast i16* [[TMP101]] to i32* -; CHECK-NEXT: [[TMP103:%.*]] = load i32, i32* [[TMP102]] -; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP103]], i32 1 +; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP100:%.*]] = bitcast i16* [[TMP99]] to i32* +; CHECK-NEXT: [[TMP101:%.*]] = load i32, i32* [[TMP100]] +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP97]], i32 [[TMP101]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK: pred.load.continue16: -; CHECK-NEXT: [[TMP105:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP104]], [[PRED_LOAD_IF15]] ] -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br 
i1 [[TMP106]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] +; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP97]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP102]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 +; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK: pred.load.if17: -; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds i16, i16* [[TMP107]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP109:%.*]] = bitcast i16* [[TMP108]] to i32* -; CHECK-NEXT: [[TMP110:%.*]] = load i32, i32* [[TMP109]] -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP105]], i32 [[TMP110]], i32 2 +; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP106:%.*]] = bitcast i16* [[TMP105]] to i32* +; CHECK-NEXT: [[TMP107:%.*]] = load i32, i32* [[TMP106]] +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> [[TMP103]], i32 [[TMP107]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK: pred.load.continue18: -; CHECK-NEXT: [[TMP112:%.*]] = phi <4 x i32> [ [[TMP105]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP111]], [[PRED_LOAD_IF17]] ] -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; CHECK-NEXT: [[TMP109:%.*]] = phi <4 x i32> [ [[TMP103]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP108]], [[PRED_LOAD_IF17]] ] +; CHECK-NEXT: [[TMP110:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 +; CHECK-NEXT: br i1 [[TMP110]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK: pred.load.if19: -; CHECK-NEXT: [[TMP114:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i16, i16* [[TMP114]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP116:%.*]] = bitcast i16* [[TMP115]] to i32* -; CHECK-NEXT: 
[[TMP117:%.*]] = load i32, i32* [[TMP116]] -; CHECK-NEXT: [[TMP118:%.*]] = insertelement <4 x i32> [[TMP112]], i32 [[TMP117]], i32 3 +; CHECK-NEXT: [[TMP111:%.*]] = getelementptr inbounds i16, i16* [[TMP65]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP112:%.*]] = bitcast i16* [[TMP111]] to i32* +; CHECK-NEXT: [[TMP113:%.*]] = load i32, i32* [[TMP112]] +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP113]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK: pred.load.continue20: -; CHECK-NEXT: [[TMP119:%.*]] = phi <4 x i32> [ [[TMP112]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP118]], [[PRED_LOAD_IF19]] ] -; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP120]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] +; CHECK-NEXT: [[TMP115:%.*]] = phi <4 x i32> [ [[TMP109]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP114]], [[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 +; CHECK-NEXT: br i1 [[TMP116]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK: pred.load.if21: -; CHECK-NEXT: [[TMP121:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds i16, i16* [[TMP121]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP123:%.*]] = bitcast i16* [[TMP122]] to i32* -; CHECK-NEXT: [[TMP124:%.*]] = load i32, i32* [[TMP123]] -; CHECK-NEXT: [[TMP125:%.*]] = insertelement <4 x i32> undef, i32 [[TMP124]], i32 0 +; CHECK-NEXT: [[TMP117:%.*]] = getelementptr inbounds i16, i16* [[TMP66]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP118:%.*]] = bitcast i16* [[TMP117]] to i32* +; CHECK-NEXT: [[TMP119:%.*]] = load i32, i32* [[TMP118]] +; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> undef, i32 [[TMP119]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK: pred.load.continue22: -; CHECK-NEXT: [[TMP126:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP125]], [[PRED_LOAD_IF21]] ] -; CHECK-NEXT: 
[[TMP127:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP127]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] +; CHECK-NEXT: [[TMP121:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP120]], [[PRED_LOAD_IF21]] ] +; CHECK-NEXT: [[TMP122:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 +; CHECK-NEXT: br i1 [[TMP122]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK: pred.load.if23: -; CHECK-NEXT: [[TMP128:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i16, i16* [[TMP128]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP130:%.*]] = bitcast i16* [[TMP129]] to i32* -; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]] -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP126]], i32 [[TMP131]], i32 1 +; CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds i16, i16* [[TMP66]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP124:%.*]] = bitcast i16* [[TMP123]] to i32* +; CHECK-NEXT: [[TMP125:%.*]] = load i32, i32* [[TMP124]] +; CHECK-NEXT: [[TMP126:%.*]] = insertelement <4 x i32> [[TMP121]], i32 [[TMP125]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK: pred.load.continue24: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP126]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP132]], [[PRED_LOAD_IF23]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; CHECK-NEXT: [[TMP127:%.*]] = phi <4 x i32> [ [[TMP121]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP126]], [[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP128:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 +; CHECK-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK: pred.load.if25: -; CHECK-NEXT: [[TMP135:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds i16, i16* [[TMP135]], i64 [[TMP10]] -; CHECK-NEXT: 
[[TMP137:%.*]] = bitcast i16* [[TMP136]] to i32* -; CHECK-NEXT: [[TMP138:%.*]] = load i32, i32* [[TMP137]] -; CHECK-NEXT: [[TMP139:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP138]], i32 2 +; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i16, i16* [[TMP66]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP130:%.*]] = bitcast i16* [[TMP129]] to i32* +; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]] +; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP127]], i32 [[TMP131]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK: pred.load.continue26: -; CHECK-NEXT: [[TMP140:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP139]], [[PRED_LOAD_IF25]] ] -; CHECK-NEXT: [[TMP141:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP141]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP127]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP132]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 +; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK: pred.load.if27: -; CHECK-NEXT: [[TMP142:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP143:%.*]] = getelementptr inbounds i16, i16* [[TMP142]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP144:%.*]] = bitcast i16* [[TMP143]] to i32* -; CHECK-NEXT: [[TMP145:%.*]] = load i32, i32* [[TMP144]] -; CHECK-NEXT: [[TMP146:%.*]] = insertelement <4 x i32> [[TMP140]], i32 [[TMP145]], i32 3 +; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i16, i16* [[TMP66]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP136:%.*]] = bitcast i16* [[TMP135]] to i32* +; CHECK-NEXT: [[TMP137:%.*]] = load i32, i32* [[TMP136]] +; CHECK-NEXT: [[TMP138:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP137]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK: pred.load.continue28: -; CHECK-NEXT: [[TMP147:%.*]] = phi <4 x i32> [ [[TMP140]], 
[[PRED_LOAD_CONTINUE26]] ], [ [[TMP146]], [[PRED_LOAD_IF27]] ] -; CHECK-NEXT: [[TMP148:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-NEXT: [[TMP139:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP138]], [[PRED_LOAD_IF27]] ] +; CHECK-NEXT: [[TMP140:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 +; CHECK-NEXT: br i1 [[TMP140]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] ; CHECK: pred.load.if29: -; CHECK-NEXT: [[TMP149:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP150:%.*]] = getelementptr inbounds i16, i16* [[TMP149]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP151:%.*]] = bitcast i16* [[TMP150]] to i32* -; CHECK-NEXT: [[TMP152:%.*]] = load i32, i32* [[TMP151]] -; CHECK-NEXT: [[TMP153:%.*]] = insertelement <4 x i32> undef, i32 [[TMP152]], i32 0 +; CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds i16, i16* [[TMP67]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP142:%.*]] = bitcast i16* [[TMP141]] to i32* +; CHECK-NEXT: [[TMP143:%.*]] = load i32, i32* [[TMP142]] +; CHECK-NEXT: [[TMP144:%.*]] = insertelement <4 x i32> undef, i32 [[TMP143]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] ; CHECK: pred.load.continue30: -; CHECK-NEXT: [[TMP154:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP153]], [[PRED_LOAD_IF29]] ] -; CHECK-NEXT: [[TMP155:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP155]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] +; CHECK-NEXT: [[TMP145:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP144]], [[PRED_LOAD_IF29]] ] +; CHECK-NEXT: [[TMP146:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 +; CHECK-NEXT: br i1 [[TMP146]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] ; CHECK: pred.load.if31: -; CHECK-NEXT: [[TMP156:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP157:%.*]] = getelementptr 
inbounds i16, i16* [[TMP156]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP158:%.*]] = bitcast i16* [[TMP157]] to i32* -; CHECK-NEXT: [[TMP159:%.*]] = load i32, i32* [[TMP158]] -; CHECK-NEXT: [[TMP160:%.*]] = insertelement <4 x i32> [[TMP154]], i32 [[TMP159]], i32 1 +; CHECK-NEXT: [[TMP147:%.*]] = getelementptr inbounds i16, i16* [[TMP67]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP148:%.*]] = bitcast i16* [[TMP147]] to i32* +; CHECK-NEXT: [[TMP149:%.*]] = load i32, i32* [[TMP148]] +; CHECK-NEXT: [[TMP150:%.*]] = insertelement <4 x i32> [[TMP145]], i32 [[TMP149]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] ; CHECK: pred.load.continue32: -; CHECK-NEXT: [[TMP161:%.*]] = phi <4 x i32> [ [[TMP154]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP160]], [[PRED_LOAD_IF31]] ] -; CHECK-NEXT: [[TMP162:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP162]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] +; CHECK-NEXT: [[TMP151:%.*]] = phi <4 x i32> [ [[TMP145]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP150]], [[PRED_LOAD_IF31]] ] +; CHECK-NEXT: [[TMP152:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 +; CHECK-NEXT: br i1 [[TMP152]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] ; CHECK: pred.load.if33: -; CHECK-NEXT: [[TMP163:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP164:%.*]] = getelementptr inbounds i16, i16* [[TMP163]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP165:%.*]] = bitcast i16* [[TMP164]] to i32* -; CHECK-NEXT: [[TMP166:%.*]] = load i32, i32* [[TMP165]] -; CHECK-NEXT: [[TMP167:%.*]] = insertelement <4 x i32> [[TMP161]], i32 [[TMP166]], i32 2 +; CHECK-NEXT: [[TMP153:%.*]] = getelementptr inbounds i16, i16* [[TMP67]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP154:%.*]] = bitcast i16* [[TMP153]] to i32* +; CHECK-NEXT: [[TMP155:%.*]] = load i32, i32* [[TMP154]] +; CHECK-NEXT: [[TMP156:%.*]] = insertelement <4 x i32> [[TMP151]], i32 [[TMP155]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] ; CHECK: pred.load.continue34: -; 
CHECK-NEXT: [[TMP168:%.*]] = phi <4 x i32> [ [[TMP161]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP167]], [[PRED_LOAD_IF33]] ] -; CHECK-NEXT: [[TMP169:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP169]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36]] +; CHECK-NEXT: [[TMP157:%.*]] = phi <4 x i32> [ [[TMP151]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP156]], [[PRED_LOAD_IF33]] ] +; CHECK-NEXT: [[TMP158:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 +; CHECK-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36]] ; CHECK: pred.load.if35: -; CHECK-NEXT: [[TMP170:%.*]] = bitcast i32* [[BASE]] to i16* -; CHECK-NEXT: [[TMP171:%.*]] = getelementptr inbounds i16, i16* [[TMP170]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP172:%.*]] = bitcast i16* [[TMP171]] to i32* -; CHECK-NEXT: [[TMP173:%.*]] = load i32, i32* [[TMP172]] -; CHECK-NEXT: [[TMP174:%.*]] = insertelement <4 x i32> [[TMP168]], i32 [[TMP173]], i32 3 +; CHECK-NEXT: [[TMP159:%.*]] = getelementptr inbounds i16, i16* [[TMP67]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP160:%.*]] = bitcast i16* [[TMP159]] to i32* +; CHECK-NEXT: [[TMP161:%.*]] = load i32, i32* [[TMP160]] +; CHECK-NEXT: [[TMP162:%.*]] = insertelement <4 x i32> [[TMP157]], i32 [[TMP161]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] ; CHECK: pred.load.continue36: -; CHECK-NEXT: [[TMP175:%.*]] = phi <4 x i32> [ [[TMP168]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP174]], [[PRED_LOAD_IF35]] ] -; CHECK-NEXT: [[TMP176:%.*]] = xor <4 x i1> [[TMP39]], -; CHECK-NEXT: [[TMP177:%.*]] = xor <4 x i1> [[TMP47]], -; CHECK-NEXT: [[TMP178:%.*]] = xor <4 x i1> [[TMP55]], -; CHECK-NEXT: [[TMP179:%.*]] = xor <4 x i1> [[TMP63]], +; CHECK-NEXT: [[TMP163:%.*]] = phi <4 x i32> [ [[TMP157]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP162]], [[PRED_LOAD_IF35]] ] +; CHECK-NEXT: [[TMP164:%.*]] = xor <4 x i1> [[TMP39]], +; CHECK-NEXT: [[TMP165:%.*]] = xor <4 x i1> [[TMP47]], +; CHECK-NEXT: [[TMP166:%.*]] = xor <4 x i1> [[TMP55]], +; 
CHECK-NEXT: [[TMP167:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP91]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI37:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP119]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI38:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP147]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI39:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP175]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP180]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP181]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI37]] -; CHECK-NEXT: [[TMP182]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI38]] -; CHECK-NEXT: [[TMP183]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI39]] +; CHECK-NEXT: [[PREDPHI37:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP115]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI38:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP139]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI39:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP163]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP168]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP169]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI37]] +; CHECK-NEXT: [[TMP170]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI38]] +; CHECK-NEXT: [[TMP171]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI39]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP184:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; CHECK-NEXT: br i1 [[TMP184]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; CHECK-NEXT: [[TMP172:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; CHECK-NEXT: br i1 [[TMP172]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]] -; CHECK-NEXT: [[BIN_RDX40:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX41:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX40]] +; 
CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP169]], [[TMP168]] +; CHECK-NEXT: [[BIN_RDX40:%.*]] = add <4 x i32> [[TMP170]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX41:%.*]] = add <4 x i32> [[TMP171]], [[BIN_RDX40]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX41]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX42:%.*]] = add <4 x i32> [[BIN_RDX41]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF43:%.*]] = shufflevector <4 x i32> [[BIN_RDX42]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX44:%.*]] = add <4 x i32> [[BIN_RDX42]], [[RDX_SHUF43]] -; CHECK-NEXT: [[TMP185:%.*]] = extractelement <4 x i32> [[BIN_RDX44]], i32 0 +; CHECK-NEXT: [[TMP173:%.*]] = extractelement <4 x i32> [[BIN_RDX44]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP173]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -836,7 +824,7 @@ ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop !9 ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP173]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ 
b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -336,8 +336,9 @@ ; UNROLL-NO-IC-LABEL: @constant_folded_previous_value( ; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %vector.ph ], [ , %vector.body ] -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> , <4 x i32> +; UNROLL-NO-IC: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %vector.ph ], [ %broadcast.splat4, %vector.body ] +; UNROLL-NO-IC-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 +; UNROLL-NO-IC-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer ; UNROLL-NO-IC: br i1 {{.*}}, label %middle.block, label %vector.body ; define void @constant_folded_previous_value() { diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll --- a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll @@ -375,21 +375,18 @@ ; CHECK-LABEL: non_uniform_live_out() ; CHECK-LABEL: vector.body: -; CHECK: %vec.ind = phi <2 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] -; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add <2 x i32> %vec.ind, -; CHECK: [[EE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 0 -; CHECK: [[GEP:%[a-zA-Z0-9.]+]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[EE]] +; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: [[ADD1:%[a-zA-Z0-9.]+]] = add i32 %index, 0 +; CHECK-NEXT: [[ADD2:%[a-zA-Z0-9.]+]] = add i32 [[ADD1]], 7 +; CHECK: [[GEP:%[a-zA-Z0-9.]+]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[ADD2]] ; CHECK-NEXT: [[GEP2:%[a-zA-Z0-9.]+]] = getelementptr inbounds i8, i8* [[GEP]], i32 0 ; CHECK-NEXT: [[BC:%[a-zA-Z0-9.]+]] = bitcast i8* [[GEP2]] to <2 x i8>* ; CHECK-NEXT: %wide.load = load <2 x i8>, <2 x i8>* [[BC]] -; CHECK-NEXT: 
[[ADD2:%[a-zA-Z0-9.]+]] = add <2 x i8> %wide.load, -; CHECK: store <2 x i8> [[ADD2]], <2 x i8>* - -; CHECK-LABEL: middle.block: -; CHECK: [[ADDEE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 1 +; CHECK-NEXT: [[ADD3:%[a-zA-Z0-9.]+]] = add <2 x i8> %wide.load, +; CHECK: store <2 x i8> [[ADD3]], <2 x i8>* ; CHECK-LABEL: for.end: -; CHECK: %lcssa = phi i32 [ %i.09, %for.body ], [ [[ADDEE]], %middle.block ] +; CHECK: %lcssa = phi i32 [ %i.09, %for.body ], [ [[ADD2]], %middle.block ] ; CHECK: %arrayidx.out = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %lcssa define i32 @non_uniform_live_out() { entry: diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll --- a/llvm/test/Transforms/LoopVectorize/pr32859.ll +++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -loop-vectorize -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -S | FileCheck --check-prefix=CM %s +; RUN: opt -force-vector-width=4 < %s -loop-vectorize -S | FileCheck --check-prefix=FORCE %s ; Out of the LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]' ; but the IR Verifier requires for PHI one entry for each predecessor of @@ -6,9 +7,27 @@ ; added 'undef' for an predecessor BB and which is not correct. We copy the real ; value for another predecessor instead of bringing 'undef'. -; CHECK-LABEL: for.cond.preheader: -; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ] +; FORCE-LABEL: for.cond.preheader: +; FORCE-NEXT: %e.0.ph = phi i32 [ 0, %if.end.2.i ] +; Without forcing vectorization, we do not vectorize because we won't generate +; any vector instructions, besides the loop management code. 
+; CM-LABEL: entry: +; CM-NEXT: br label %for.cond1.preheader.i + +; CM-LABEL: for.cond1.preheader.i: +; CM-NEXT: %c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ] +; CM-NEXT: %tobool.i = icmp ne i32 undef, 0 +; CM-NEXT: br label %if.end.2.i + +; CM-LABEL: if.end.2.i: +; CM-NEXT: %inc5.i = add nsw i32 %c.06.i, 1 +; CM-NEXT: %cmp.i = icmp slt i32 %inc5.i, 16 +; CM-NEXT: br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.preheader + +; CM-LABEL: for.cond.preheader: +; CM-NEXT: %e.0.ph = phi i32 [ 0, %if.end.2.i ] +; CM-NEXT: unreachable ; Function Attrs: nounwind uwtable define void @main() #0 { entry: diff --git a/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll b/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll --- a/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll @@ -1,18 +1,18 @@ ; RUN: opt -S -loop-vectorize -force-vector-width=4 %s | FileCheck %s -; CHECK-LABEL: @test_fshl +; CHECK-LABEL: @test_fshl_invariant ; CHECK-LABEL: vector.body: ; CHECK-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 ; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: %induction = add <4 x i32> %broadcast.splat, ; CHECK-NEXT: %0 = add i32 %index, 0 -; CHECK-NEXT: %1 = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i16> ) +; CHECK-NEXT: %1 = tail call i16 @llvm.fshl.i16(i16 undef, i16 undef, i16 15) ; CHECK-NEXT: %index.next = add i32 %index, 4 ; CHECK-NEXT: %2 = icmp eq i32 %index.next, %n.vec ; CHECK-NEXT: br i1 %2, label %middle.block, label %vector.body, !llvm.loop !0 ; -define void @test_fshl(i32 %width) { +define void @test_fshl_invariant(i32 %width) { entry: br label %for.body9.us.us @@ -28,3 +28,36 @@ } declare i16 
@llvm.fshl.i16(i16, i16, i16) + +; CHECK-LABEL: @test_fshl( +; CHECK-LABEL: vector.body: + ; CHECK-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: %induction = add <4 x i32> %broadcast.splat, +; CHECK-NEXT: %8 = add i32 %index, 0 +; CHECK-NEXT: %9 = getelementptr i16, i16* %A, i32 %8 +; CHECK-NEXT: %10 = getelementptr i16, i16* %9, i32 0 +; CHECK-NEXT: %11 = bitcast i16* %10 to <4 x i16>* +; CHECK-NEXT: %wide.load = load <4 x i16>, <4 x i16>* %11, align 2 +; CHECK-NEXT: %12 = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %wide.load, <4 x i16> %wide.load, <4 x i16> ) +; CHECK-NEXT: %index.next = add i32 %index, 4 +; CHECK-NEXT: %13 = icmp eq i32 %index.next, %n.vec +; CHECK-NEXT: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4 + +define void @test_fshl(i32 %width, i16* %A) { +entry: + br label %for.body9.us.us + +for.cond6.for.cond.cleanup8_crit_edge.us.us: ; preds = %for.body9.us.us + ret void + +for.body9.us.us: ; preds = %for.body9.us.us, %entry + %x.020.us.us = phi i32 [ 0, %entry ], [ %inc.us.us, %for.body9.us.us ] + %A.ptr = getelementptr i16, i16* %A, i32 %x.020.us.us + %a = load i16, i16* %A.ptr + %conv4.i.us.us = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 15) + %inc.us.us = add nuw i32 %x.020.us.us, 1 + %exitcond50 = icmp eq i32 %inc.us.us, %width + br i1 %exitcond50, label %for.cond6.for.cond.cleanup8_crit_edge.us.us, label %for.body9.us.us +}