diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -5644,54 +5644,74 @@ } const Value *llvm::getUnderlyingObject(const Value *V, unsigned MaxLookup) { - std::function<const Value *(const Value *, unsigned)> Visit = - [&](const Value *Root, unsigned Count) { - const Value *V = Root; - if (!V->getType()->isPointerTy() || - (MaxLookup != 0 && Count == MaxLookup)) + unsigned Count = 0; + DenseMap<const Value *, const Value *> Visited; + + std::function<const Value *(const Value *)> Visit = [&](const Value *Root) { + const Value *V = Root; + if (Visited.contains(V)) + return Visited.at(V); + if (!V->getType()->isPointerTy() || (MaxLookup != 0 && Count == MaxLookup)) + return V; + Count += 1; + + if (auto *GEP = dyn_cast<GEPOperator>(V)) { + V = GEP->getPointerOperand(); + } else if (Operator::getOpcode(V) == Instruction::BitCast || + Operator::getOpcode(V) == Instruction::AddrSpaceCast) { + V = cast<Operator>(V)->getOperand(0); + if (!V->getType()->isPointerTy()) + return V; + } else if (auto *GA = dyn_cast<GlobalAlias>(V)) { + if (GA->isInterposable()) + return V; + V = GA->getAliasee(); + } else { + if (auto *PHI = dyn_cast<PHINode>(V)) { + Visited[V] = V; + if (PHI->getNumIncomingValues() == 0) return V; - if (auto *GEP = dyn_cast<GEPOperator>(V)) { - V = GEP->getPointerOperand(); - } else if (Operator::getOpcode(V) == Instruction::BitCast || - Operator::getOpcode(V) == Instruction::AddrSpaceCast) { - V = cast<Operator>(V)->getOperand(0); - if (!V->getType()->isPointerTy()) - return V; - } else if (auto *GA = dyn_cast<GlobalAlias>(V)) { - if (GA->isInterposable()) + // We can look through phis if each incoming value has the same + // underlying object, or is the phi itself. + const Value *NewUnderlying = Visit(PHI->getIncomingValue(0)); + for (unsigned I = 1; I < PHI->getNumIncomingValues(); ++I) { + const Value *IncomingUnderlying = Visit(PHI->getIncomingValue(I)); + if (IncomingUnderlying == V || IncomingUnderlying == NewUnderlying) + continue; + if (NewUnderlying == V) + // Found a new possible underlying object. + NewUnderlying = IncomingUnderlying; + else // IncomingUnderlying != NewUnderlying + // There are >=2 possible underlying objects. We cannot + // determine a new underlying object. return V; - V = GA->getAliasee(); - } else { - if (auto *PHI = dyn_cast<PHINode>(V)) { - // Look through single-arg phi nodes created by LCSSA. - if (PHI->getNumIncomingValues() == 1) { - V = PHI->getIncomingValue(0); - return Visit(V, Count + 1); - } - } else if (auto *Call = dyn_cast<CallBase>(V)) { - // CaptureTracking can know about special capturing properties of - // some intrinsics like launder.invariant.group, that can't be - // expressed with the attributes, but have properties like returning - // aliasing pointer. Because some analysis may assume that - // nocaptured pointer is not returned from some special intrinsic - // (because function would have to be marked with returns - // attribute), it is crucial to use this function because it should - // be in sync with CaptureTracking. Not using it may cause weird - // miscompilations where 2 aliasing pointers are assumed to noalias.
- if (auto *RP = getArgumentAliasingToReturnedPointer(Call, false)) { - V = RP; - return Visit(RP, Count + 1); - } - } - - return V; } - assert(V->getType()->isPointerTy() && "Unexpected operand type!"); - return Visit(V, Count + 1); - }; + V = NewUnderlying; + } else if (auto *Call = dyn_cast<CallBase>(V)) { + // CaptureTracking can know about special capturing properties of + // some intrinsics like launder.invariant.group, that can't be + // expressed with the attributes, but have properties like returning + // aliasing pointer. Because some analysis may assume that + // nocaptured pointer is not returned from some special intrinsic + // (because function would have to be marked with returns + // attribute), it is crucial to use this function because it should + // be in sync with CaptureTracking. Not using it may cause weird + // miscompilations where 2 aliasing pointers are assumed to noalias. + if (auto *RP = getArgumentAliasingToReturnedPointer(Call, false)) { + V = RP; + return Visit(RP); + } + } + Visited[Root] = V; + return V; + } + assert(V->getType()->isPointerTy() && "Unexpected operand type!"); + Visited[Root] = V; + return Visit(V); + }; - return Visit(V, 0); + return Visit(V); } void llvm::getUnderlyingObjects(const Value *V, diff --git a/llvm/test/Analysis/BasicAA/phi-aa.ll b/llvm/test/Analysis/BasicAA/phi-aa.ll --- a/llvm/test/Analysis/BasicAA/phi-aa.ll +++ b/llvm/test/Analysis/BasicAA/phi-aa.ll @@ -82,7 +82,7 @@ ; actually only one underlying value. ; FIXME: All of these could be NoAlias. ; CHECK-LABEL: loop_phi_chain -; CHECK: MayAlias: i32* %val1, i32* @Y +; CHECK: NoAlias: i32* %val1, i32* @Y ; CHECK: MayAlias: i32* %val2, i32* @Y ; CHECK: MayAlias: i32* %val3, i32* @Y define void @loop_phi_chain(i32 %a, i32 %b, i32 %c) { diff --git a/llvm/test/Analysis/BasicAA/recphi.ll b/llvm/test/Analysis/BasicAA/recphi.ll --- a/llvm/test/Analysis/BasicAA/recphi.ll +++ b/llvm/test/Analysis/BasicAA/recphi.ll @@ -41,11 +41,11 @@ ; CHECK: MustAlias: i32* %tab, [2 x i32]* %tab ; CHECK: MustAlias: i32* %tab, i8* %tab ; CHECK: NoAlias: i32* %arrayidx, i32* %tab -; CHECK: MayAlias: i32* %incdec.ptr.i, [2 x i32]* %tab +; CHECK: PartialAlias: i32* %incdec.ptr.i, [2 x i32]* %tab ; CHECK: NoAlias: i32* %incdec.ptr.i, i8* %tab ; CHECK: MayAlias: i32* %arrayidx, i32* %incdec.ptr.i ; CHECK: NoAlias: i32* %incdec.ptr.i, i32* %tab -; CHECK: MayAlias: i32* %p.addr.05.i, [2 x i32]* %tab +; CHECK: PartialAlias: i32* %p.addr.05.i, [2 x i32]* %tab ; CHECK: MayAlias: i32* %p.addr.05.i, i8* %tab ; CHECK: MayAlias: i32* %arrayidx, i32* %p.addr.05.i ; CHECK: MayAlias: i32* %p.addr.05.i, i32* %tab @@ -95,11 +95,11 @@ ; CHECK: PartialAlias (off -36): i32* %arrayidx1, [10 x i32]* %tab ; CHECK: NoAlias: i32* %arrayidx1, i8* %tab ; CHECK: NoAlias: i32* %arrayidx1, i32* %tab -; CHECK: MayAlias: i32* %incdec.ptr.i, [10 x i32]* %tab +; CHECK: PartialAlias: i32* %incdec.ptr.i, [10 x i32]* %tab ; CHECK: MayAlias: i32* %incdec.ptr.i, i8* %tab ; CHECK: MayAlias: i32* %incdec.ptr.i, i32* %tab ; CHECK: MayAlias: i32* %arrayidx1, i32* %incdec.ptr.i -; CHECK: MayAlias: i32* %p.addr.05.i, [10 x i32]* %tab +; CHECK: PartialAlias: i32* %p.addr.05.i, [10 x i32]* %tab ; CHECK: MayAlias: i32* %p.addr.05.i, i8* %tab ; CHECK: MayAlias: i32* %p.addr.05.i, i32* %tab ; CHECK: MayAlias: i32* %arrayidx1, i32* %p.addr.05.i @@ -144,9 +144,9 @@ ; CHECK-LABEL: Function: negative: 5 pointers, 1 call sites ; CHECK: PartialAlias (off -4): i16* %_tmp1, [3 x i16]* %int_arr.10 -; CHECK: MayAlias: [3 x i16]* %int_arr.10, i16* %ls1.9.0 +; CHECK: 
PartialAlias: [3 x i16]* %int_arr.10, i16* %ls1.9.0 ; CHECK: MayAlias: i16* %_tmp1, i16* %ls1.9.0 -; CHECK: MayAlias: i16* %_tmp7, [3 x i16]* %int_arr.10 +; CHECK: PartialAlias: i16* %_tmp7, [3 x i16]* %int_arr.10 ; CHECK: MayAlias: i16* %_tmp1, i16* %_tmp7 ; CHECK: NoAlias: i16* %_tmp7, i16* %ls1.9.0 ; CHECK: PartialAlias (off -2): i16* %_tmp11, [3 x i16]* %int_arr.10 @@ -155,8 +155,8 @@ ; CHECK: MayAlias: i16* %_tmp11, i16* %_tmp7 ; CHECK: NoModRef: Ptr: [3 x i16]* %int_arr.10 <-> %_tmp16 = call i16 @call(i32 %_tmp13) ; CHECK: NoModRef: Ptr: i16* %_tmp1 <-> %_tmp16 = call i16 @call(i32 %_tmp13) -; CHECK: Both ModRef: Ptr: i16* %ls1.9.0 <-> %_tmp16 = call i16 @call(i32 %_tmp13) -; CHECK: Both ModRef: Ptr: i16* %_tmp7 <-> %_tmp16 = call i16 @call(i32 %_tmp13) +; CHECK: NoModRef: Ptr: i16* %ls1.9.0 <-> %_tmp16 = call i16 @call(i32 %_tmp13) +; CHECK: NoModRef: Ptr: i16* %_tmp7 <-> %_tmp16 = call i16 @call(i32 %_tmp13) ; CHECK: NoModRef: Ptr: i16* %_tmp11 <-> %_tmp16 = call i16 @call(i32 %_tmp13) define i16 @negative(i16 %argc.5.par) { %int_arr.10 = alloca [3 x i16], align 1 @@ -282,7 +282,7 @@ ; CHECK: NoAlias: i8* %a, i8* %p.base ; CHECK: NoAlias: i8* %a, i8* %p.outer ; CHECK: NoAlias: i8* %a, i8* %p.outer.next -; CHECK: MayAlias: i8* %a, i8* %p.inner +; CHECK: NoAlias: i8* %a, i8* %p.inner ; CHECK: NoAlias: i8* %a, i8* %p.inner.next ; TODO: (a, p.inner) could be NoAlias define void @nested_loop2(i1 %c, i1 %c2, ptr noalias %p.base) { @@ -351,7 +351,7 @@ ; CHECK: NoAlias: i8* %a, i8* %p.base ; CHECK: NoAlias: i8* %a, i8* %p1 ; CHECK: NoAlias: i8* %a, i8* %p1.next -; CHECK: MayAlias: i8* %a, i8* %p2 +; CHECK: NoAlias: i8* %a, i8* %p2 ; CHECK: NoAlias: i8* %a, i8* %p2.next ; TODO: %p2 does not alias %a define void @sibling_loop(i1 %c, i1 %c2, ptr noalias %p.base) { diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -424,8 +424,8 @@ ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: b .LBB5_7 ; CHECK-NEXT: .LBB5_3: -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: b .LBB5_9 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret ; CHECK-NEXT: .LBB5_4: // %vector.ph ; CHECK-NEXT: and x11, x10, #0xfffffff0 ; CHECK-NEXT: add x8, x0, #8 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll @@ -128,12 +128,9 @@ ; } ; } -; FIXME: This should be promotable. We need to use -; getUnderlyingObjects when looking at the icmp user. 
- ; CHECK-LABEL: @ptr_induction_var_same_alloca( -; CHECK: %alloca = alloca [64 x i32], align 4 -; CHECK: phi ptr addrspace(5) [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] +; CHECK-NOT: {{.*}} = alloca +; CHECK: phi ptr addrspace(3) [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 { entry: %alloca = alloca [64 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll @@ -17,8 +17,6 @@ define fastcc ptr @wrongUseOfPostDominate(ptr readonly %s, i32 %off, ptr readnone %lim) { ; ENABLE-LABEL: wrongUseOfPostDominate: ; ENABLE: @ %bb.0: @ %entry -; ENABLE-NEXT: .save {r11, lr} -; ENABLE-NEXT: push {r11, lr} ; ENABLE-NEXT: cmn r1, #1 ; ENABLE-NEXT: ble .LBB0_7 ; ENABLE-NEXT: @ %bb.1: @ %while.cond.preheader @@ -26,8 +24,8 @@ ; ENABLE-NEXT: beq .LBB0_6 ; ENABLE-NEXT: @ %bb.2: @ %while.cond.preheader ; ENABLE-NEXT: cmp r0, r2 -; ENABLE-NEXT: pophs {r11, pc} -; ENABLE-NEXT: .LBB0_3: @ %while.body.preheader +; ENABLE-NEXT: bhs .LBB0_6 +; ENABLE-NEXT: @ %bb.3: @ %while.body.preheader ; ENABLE-NEXT: movw r12, :lower16:skip ; ENABLE-NEXT: sub r1, r1, #1 ; ENABLE-NEXT: movt r12, :upper16:skip @@ -38,70 +36,73 @@ ; ENABLE-NEXT: add r0, r0, r3 ; ENABLE-NEXT: sub r3, r1, #1 ; ENABLE-NEXT: cmp r3, r1 -; ENABLE-NEXT: bhs .LBB0_6 -; ENABLE-NEXT: @ %bb.5: @ %while.body +; ENABLE-NEXT: bxhs lr +; ENABLE-NEXT: .LBB0_5: @ %while.body ; ENABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLE-NEXT: cmp r0, r2 ; ENABLE-NEXT: mov r1, r3 ; ENABLE-NEXT: blo .LBB0_4 ; ENABLE-NEXT: .LBB0_6: @ %if.end29 -; ENABLE-NEXT: pop {r11, pc} -; ENABLE-NEXT: .LBB0_7: @ %while.cond2.outer +; ENABLE-NEXT: bx lr +; ENABLE-NEXT: .LBB0_7: +; ENABLE-NEXT: .save {r11, lr} +; ENABLE-NEXT: push {r11, lr} +; ENABLE-NEXT: .LBB0_8: @ %while.cond2.outer ; ENABLE-NEXT: @ =>This Loop Header: Depth=1 -; ENABLE-NEXT: @ Child Loop BB0_8 Depth 2 -; ENABLE-NEXT: @ Child Loop BB0_15 Depth 2 +; ENABLE-NEXT: @ Child Loop BB0_9 Depth 2 +; ENABLE-NEXT: @ Child Loop BB0_16 Depth 2 ; ENABLE-NEXT: mov r3, r0 -; ENABLE-NEXT: .LBB0_8: @ %while.cond2 -; ENABLE-NEXT: @ Parent Loop BB0_7 Depth=1 +; ENABLE-NEXT: .LBB0_9: @ %while.cond2 +; ENABLE-NEXT: @ Parent Loop BB0_8 Depth=1 ; ENABLE-NEXT: @ => This Inner Loop Header: Depth=2 ; ENABLE-NEXT: add r1, r1, #1 ; ENABLE-NEXT: cmp r1, #1 -; ENABLE-NEXT: beq .LBB0_18 -; ENABLE-NEXT: @ %bb.9: @ %while.body4 -; ENABLE-NEXT: @ in Loop: Header=BB0_8 Depth=2 +; ENABLE-NEXT: beq .LBB0_19 +; ENABLE-NEXT: @ %bb.10: @ %while.body4 +; ENABLE-NEXT: @ in Loop: Header=BB0_9 Depth=2 ; ENABLE-NEXT: cmp r3, r2 -; ENABLE-NEXT: bls .LBB0_8 -; ENABLE-NEXT: @ %bb.10: @ %if.then7 -; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 +; ENABLE-NEXT: bls .LBB0_9 +; ENABLE-NEXT: @ %bb.11: @ %if.then7 +; ENABLE-NEXT: @ in Loop: Header=BB0_8 Depth=1 ; ENABLE-NEXT: mov r0, r3 ; ENABLE-NEXT: ldrb r12, [r0, #-1]! 
; ENABLE-NEXT: sxtb lr, r12 ; ENABLE-NEXT: cmn lr, #1 -; ENABLE-NEXT: bgt .LBB0_7 -; ENABLE-NEXT: @ %bb.11: @ %if.then7 -; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 +; ENABLE-NEXT: bgt .LBB0_8 +; ENABLE-NEXT: @ %bb.12: @ %if.then7 +; ENABLE-NEXT: @ in Loop: Header=BB0_8 Depth=1 ; ENABLE-NEXT: cmp r0, r2 -; ENABLE-NEXT: bls .LBB0_7 -; ENABLE-NEXT: @ %bb.12: @ %land.rhs14.preheader -; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 -; ENABLE-NEXT: cmn lr, #1 -; ENABLE-NEXT: bgt .LBB0_7 +; ENABLE-NEXT: bls .LBB0_8 ; ENABLE-NEXT: @ %bb.13: @ %land.rhs14.preheader -; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 +; ENABLE-NEXT: @ in Loop: Header=BB0_8 Depth=1 +; ENABLE-NEXT: cmn lr, #1 +; ENABLE-NEXT: bgt .LBB0_8 +; ENABLE-NEXT: @ %bb.14: @ %land.rhs14.preheader +; ENABLE-NEXT: @ in Loop: Header=BB0_8 Depth=1 ; ENABLE-NEXT: cmp r12, #191 -; ENABLE-NEXT: bhi .LBB0_7 -; ENABLE-NEXT: @ %bb.14: @ %while.body24.preheader -; ENABLE-NEXT: @ in Loop: Header=BB0_7 Depth=1 +; ENABLE-NEXT: bhi .LBB0_8 +; ENABLE-NEXT: @ %bb.15: @ %while.body24.preheader +; ENABLE-NEXT: @ in Loop: Header=BB0_8 Depth=1 ; ENABLE-NEXT: sub r3, r3, #2 -; ENABLE-NEXT: .LBB0_15: @ %while.body24 -; ENABLE-NEXT: @ Parent Loop BB0_7 Depth=1 +; ENABLE-NEXT: .LBB0_16: @ %while.body24 +; ENABLE-NEXT: @ Parent Loop BB0_8 Depth=1 ; ENABLE-NEXT: @ => This Inner Loop Header: Depth=2 ; ENABLE-NEXT: mov r0, r3 ; ENABLE-NEXT: cmp r3, r2 -; ENABLE-NEXT: bls .LBB0_7 -; ENABLE-NEXT: @ %bb.16: @ %while.body24.land.rhs14_crit_edge -; ENABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2 +; ENABLE-NEXT: bls .LBB0_8 +; ENABLE-NEXT: @ %bb.17: @ %while.body24.land.rhs14_crit_edge +; ENABLE-NEXT: @ in Loop: Header=BB0_16 Depth=2 ; ENABLE-NEXT: mov r3, r0 ; ENABLE-NEXT: ldrsb lr, [r3], #-1 ; ENABLE-NEXT: cmn lr, #1 ; ENABLE-NEXT: uxtb r12, lr -; ENABLE-NEXT: bgt .LBB0_7 -; ENABLE-NEXT: @ %bb.17: @ %while.body24.land.rhs14_crit_edge -; ENABLE-NEXT: @ in Loop: Header=BB0_15 Depth=2 +; ENABLE-NEXT: bgt .LBB0_8 +; ENABLE-NEXT: @ %bb.18: @ %while.body24.land.rhs14_crit_edge +; ENABLE-NEXT: @ in Loop: Header=BB0_16 Depth=2 ; ENABLE-NEXT: cmp r12, #192 -; ENABLE-NEXT: blo .LBB0_15 -; ENABLE-NEXT: b .LBB0_7 -; ENABLE-NEXT: .LBB0_18: +; ENABLE-NEXT: blo .LBB0_16 +; ENABLE-NEXT: b .LBB0_8 +; ENABLE-NEXT: .LBB0_19: ; ENABLE-NEXT: mov r0, r3 ; ENABLE-NEXT: pop {r11, pc} ; diff --git a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll --- a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll @@ -224,14 +224,15 @@ ; CHECK-NEXT: vldrb.u8 q0, [r12], #16 ; CHECK-NEXT: vstrb.8 q0, [r4], #16 ; CHECK-NEXT: letp lr, .LBB10_2 -; CHECK-NEXT: .LBB10_3: @ %for.body +; CHECK-NEXT: .LBB10_3: @ %prehead +; CHECK-NEXT: pop.w {r4, lr} +; CHECK-NEXT: .LBB10_4: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r3, [r0], #1 ; CHECK-NEXT: subs r2, #2 ; CHECK-NEXT: strb r3, [r1], #1 -; CHECK-NEXT: bne .LBB10_3 -; CHECK-NEXT: @ %bb.4: -; CHECK-NEXT: pop.w {r4, lr} +; CHECK-NEXT: bne .LBB10_4 +; CHECK-NEXT: @ %bb.5: @ %for.cond.cleanup ; CHECK-NEXT: bx lr entry: %cmp6 = icmp slt i32 %n, 0 diff --git a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll --- a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll +++ b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll @@ -15,9 +15,7 @@ ; CHECK: else: ; CHECK-NEXT: br label [[SINK]] ; CHECK: sink: -; CHECK-NEXT: [[PTR1:%.*]] = phi ptr [ getelementptr inbounds ([32 x i8], ptr @g1, 
i64 0, i64 2), [[IF]] ], [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 1), [[ELSE]] ] -; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR1]], align 1 -; CHECK-NEXT: ret i8 [[LOAD]] +; CHECK-NEXT: ret i8 0 ; entry: %alloca = alloca [32 x i8], align 4, addrspace(1) @@ -114,13 +112,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[BB_0:%.*]] ; CHECK: bb.0: -; CHECK-NEXT: [[PTR1:%.*]] = phi ptr [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 1), [[ENTRY:%.*]] ], [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 2), [[BB_1:%.*]] ] -; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB_1]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB_1:%.*]], label [[EXIT:%.*]] ; CHECK: bb.1: ; CHECK-NEXT: br label [[BB_0]] ; CHECK: exit: -; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR1]], align 1 -; CHECK-NEXT: ret i8 [[LOAD]] +; CHECK-NEXT: ret i8 0 ; entry: %alloca = alloca [32 x i8], align 4, addrspace(1) @@ -171,13 +167,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[BB_0:%.*]] ; CHECK: bb.0: -; CHECK-NEXT: [[PTR1:%.*]] = phi ptr [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 1), [[ENTRY:%.*]] ], [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 2), [[BB_1:%.*]] ] -; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB_1]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB_1:%.*]], label [[EXIT:%.*]] ; CHECK: bb.1: ; CHECK-NEXT: br label [[BB_0]] ; CHECK: exit: -; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR1]], align 1 -; CHECK-NEXT: ret i8 [[LOAD]] +; CHECK-NEXT: ret i8 0 ; entry: %alloca = alloca [32 x i8], align 4, addrspace(1) @@ -288,9 +282,7 @@ ; CHECK: if: ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: -; CHECK-NEXT: [[PHI1:%.*]] = phi ptr addrspace(1) [ @g2, [[IF]] ], [ getelementptr inbounds ([32 x i8], ptr addrspace(1) @g2, i64 0, i64 2), [[ENTRY:%.*]] ] -; CHECK-NEXT: [[V:%.*]] = load i32, ptr addrspace(1) [[PHI1]], align 4 -; CHECK-NEXT: ret i32 [[V]] +; CHECK-NEXT: ret i32 0 ; entry: %a = alloca [32 x i8] @@ -312,12 +304,9 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ @g1, [[ENTRY:%.*]] ], [ [[PTR_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[PTR_NEXT]] = getelementptr i8, ptr [[PTR]], i64 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[PTR]], align 4 -; CHECK-NEXT: ret i32 [[V]] +; CHECK-NEXT: ret i32 0 ; entry: %alloca = alloca [32 x i8] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll @@ -62,8 +62,8 @@ ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[NEXT_GEP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 -; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996 +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.body: ; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] @@ -881,62 +881,31 @@ define hidden void 
@mult_ptr_iv(ptr noalias nocapture readonly %x, ptr noalias nocapture %z) { ; CHECK-LABEL: @mult_ptr_iv( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[Z:%.*]], i32 3000 -; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[X:%.*]], i32 3000 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[Z]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[X]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[X]], i32 3000 -; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[Z]], i32 3000 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[X]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[POINTER_PHI5:%.*]] = phi ptr [ [[Z]], [[VECTOR_PH]] ], [ [[PTR_IND6:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[X:%.*]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[POINTER_PHI4:%.*]] = phi ptr [ [[Z:%.*]], [[ENTRY]] ], [ [[PTR_IND5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9> -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI5]], <4 x i32> <i32 0, i32 3, i32 6, i32 9> +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI4]], <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP0]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP0]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison), !alias.scope !28 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP0]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP0]], i32 2 -; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison), !alias.scope !28 -; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP3]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison), !alias.scope !28 +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP3]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> poison) ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], <i8 10, i8 10, i8 10, i8 10> -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER7]] -; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER8]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER6]] +; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER7]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP4]], <4 x ptr> [[TMP1]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope !31, !noalias !28 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP4]], <4 x ptr> [[TMP1]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) ; CHECK-NEXT: [[TMP8:%.*]] = 
getelementptr inbounds i8, <4 x ptr> [[TMP1]], i32 2 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP5]], <4 x ptr> [[TMP7]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope !31, !noalias !28 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP6]], <4 x ptr> [[TMP8]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope !31, !noalias !28 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP5]], <4 x ptr> [[TMP7]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP6]], <4 x ptr> [[TMP8]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 12 -; CHECK-NEXT: [[PTR_IND6]] = getelementptr i8, ptr [[POINTER_PHI5]], i32 12 +; CHECK-NEXT: [[PTR_IND5]] = getelementptr i8, ptr [[POINTER_PHI4]], i32 12 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP9]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] -; CHECK: for.body: -; CHECK-NEXT: [[X_ADDR_050:%.*]] = phi ptr [ [[INCDEC_PTR2:%.*]], [[FOR_BODY]] ], [ [[X]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[Z_ADDR_049:%.*]] = phi ptr [ [[INCDEC_PTR34:%.*]], [[FOR_BODY]] ], [ [[Z]], [[ENTRY]] ] -; CHECK-NEXT: [[I_048:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr [[X_ADDR_050]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[X_ADDR_050]], align 1 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i8, ptr [[X_ADDR_050]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1 -; CHECK-NEXT: [[INCDEC_PTR2]] = getelementptr inbounds i8, ptr [[X_ADDR_050]], i32 3 -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[INCDEC_PTR1]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP10]], 10 -; CHECK-NEXT: [[MUL1:%.*]] = mul i8 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[MUL2:%.*]] = mul i8 [[TMP10]], [[TMP12]] -; CHECK-NEXT: [[INCDEC_PTR32:%.*]] = getelementptr inbounds i8, ptr [[Z_ADDR_049]], i32 1 -; CHECK-NEXT: store i8 [[MUL]], ptr [[Z_ADDR_049]], align 1 -; CHECK-NEXT: [[INCDEC_PTR33:%.*]] = getelementptr inbounds i8, ptr [[Z_ADDR_049]], i32 2 -; CHECK-NEXT: store i8 [[MUL1]], ptr [[INCDEC_PTR32]], align 1 -; CHECK-NEXT: [[INCDEC_PTR34]] = getelementptr inbounds i8, ptr [[Z_ADDR_049]], i32 3 -; CHECK-NEXT: store i8 [[MUL2]], ptr [[INCDEC_PTR33]], align 1 -; CHECK-NEXT: [[INC]] = add nuw i32 [[I_048]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -856,120 +856,98 @@ ; AVX512-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2 ; AVX512-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 ; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 -; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; AVX512: vector.memcheck: -; AVX512-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2 -; AVX512-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4 -; AVX512-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 2 -; 
AVX512-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 6 -; AVX512-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8 -; AVX512-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP8]] -; AVX512-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP6]], 2 -; AVX512-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4 -; AVX512-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP10]] -; AVX512-NEXT: [[TMP11:%.*]] = mul nsw i64 [[IDX_EXT]], -4 -; AVX512-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP11]] -; AVX512-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], [[TMP4]] -; AVX512-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP12]] -; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DEST]], [[UGLYGEP1]] -; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR]], [[UGLYGEP]] -; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; AVX512-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[UGLYGEP3]] -; AVX512-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP]] -; AVX512-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] -; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] -; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; AVX512: vector.main.loop.iter.check: -; AVX512-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP3]], 16 -; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX512-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], 16 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16 ; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] -; AVX512-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 4 -; AVX512-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP13]] -; AVX512-NEXT: [[TMP14:%.*]] = mul i64 [[N_VEC]], 64 -; AVX512-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP14]] +; AVX512-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 4 +; AVX512-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 64 +; AVX512-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP5]] ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0 -; AVX512-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 -; AVX512-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP16]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448, i64 512, i64 576, i64 640, i64 704, i64 768, i64 832, i64 896, i64 960> -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP19]], align 4, !alias.scope !14 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD]], <16 x ptr> [[TMP17]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !17, !noalias !19 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[NEXT_GEP]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x 
float>, ptr [[TMP20]], align 4, !alias.scope !21 -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, <16 x ptr> [[TMP17]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD8]], <16 x ptr> [[TMP21]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !17, !noalias !19 +; AVX512-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; AVX512-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; AVX512-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP7]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448, i64 512, i64 576, i64 640, i64 704, i64 768, i64 832, i64 896, i64 960> +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP]], i64 [[IDXPROM]] +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP10]], align 4 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD]], <16 x ptr> [[TMP8]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[NEXT_GEP]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x float>, ptr [[TMP11]], align 4 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, <16 x ptr> [[TMP8]], i64 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD2]], <16 x ptr> [[TMP12]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX512-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 1024 -; AVX512-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; AVX512-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; AVX512: vec.epilog.iter.check: -; AVX512-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 64 -; AVX512-NEXT: [[IND_END17:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP23]] -; AVX512-NEXT: [[TMP24:%.*]] = mul i64 [[N_VEC]], 4 -; AVX512-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP24]] +; AVX512-NEXT: [[TMP14:%.*]] = mul i64 [[N_VEC]], 64 +; AVX512-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP14]] +; AVX512-NEXT: [[TMP15:%.*]] = mul i64 [[N_VEC]], 4 +; AVX512-NEXT: [[IND_END8:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP15]] ; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] ; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 ; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; AVX512: vec.epilog.ph: ; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; AVX512-NEXT: [[BC_RESUME_VAL10:%.*]] = phi ptr [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; AVX512-NEXT: [[N_MOD_VF11:%.*]] = urem i64 [[TMP3]], 8 -; AVX512-NEXT: [[N_VEC12:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF11]] -; AVX512-NEXT: 
[[TMP25:%.*]] = mul i64 [[N_VEC12]], 4 -; AVX512-NEXT: [[IND_END13:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP25]] -; AVX512-NEXT: [[TMP26:%.*]] = mul i64 [[N_VEC12]], 64 -; AVX512-NEXT: [[IND_END16:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP26]] +; AVX512-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[TMP3]], 8 +; AVX512-NEXT: [[N_VEC6:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF5]] +; AVX512-NEXT: [[TMP16:%.*]] = mul i64 [[N_VEC6]], 4 +; AVX512-NEXT: [[IND_END7:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP16]] +; AVX512-NEXT: [[TMP17:%.*]] = mul i64 [[N_VEC6]], 64 +; AVX512-NEXT: [[IND_END10:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP17]] ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; AVX512: vec.epilog.vector.body: -; AVX512-NEXT: [[POINTER_PHI22:%.*]] = phi ptr [ [[BC_RESUME_VAL10]], [[VEC_EPILOG_PH]] ], [ [[PTR_IND23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP27:%.*]] = add i64 [[INDEX20]], 0 -; AVX512-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 4 -; AVX512-NEXT: [[NEXT_GEP21:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP28]] -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[POINTER_PHI22]], <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448> -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP21]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP30]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope !23 -; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD24]], <8 x ptr> [[TMP29]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !26, !noalias !28 -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr float, ptr [[NEXT_GEP21]], i32 0 -; AVX512-NEXT: [[WIDE_LOAD25:%.*]] = load <8 x float>, ptr [[TMP32]], align 4, !alias.scope !30 -; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP29]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD25]], <8 x ptr> [[TMP33]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !26, !noalias !28 -; AVX512-NEXT: [[INDEX_NEXT26]] = add nuw i64 [[INDEX20]], 8 -; AVX512-NEXT: [[PTR_IND23]] = getelementptr i8, ptr [[POINTER_PHI22]], i64 512 -; AVX512-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC12]] -; AVX512-NEXT: br i1 [[TMP34]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; AVX512-NEXT: [[POINTER_PHI16:%.*]] = phi ptr [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_PH]] ], [ [[PTR_IND17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT20:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP18:%.*]] = add i64 [[INDEX14]], 0 +; AVX512-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; AVX512-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP19]] +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[POINTER_PHI16]], <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448> +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP15]], i64 [[IDXPROM]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0 +; AVX512-NEXT: [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP22]], align 4 +; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr float, ptr [[NEXT_GEP15]], i32 0 +; AVX512-NEXT: 
[[WIDE_LOAD19:%.*]] = load <8 x float>, ptr [[TMP23]], align 4 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP20]], i64 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD19]], <8 x ptr> [[TMP24]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; AVX512-NEXT: [[INDEX_NEXT20]] = add nuw i64 [[INDEX14]], 8 +; AVX512-NEXT: [[PTR_IND17]] = getelementptr i8, ptr [[POINTER_PHI16]], i64 512 +; AVX512-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT20]], [[N_VEC6]] +; AVX512-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; AVX512: vec.epilog.middle.block: -; AVX512-NEXT: [[CMP_N19:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC12]] -; AVX512-NEXT: br i1 [[CMP_N19]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC6]] +; AVX512-NEXT: br i1 [[CMP_N13]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL15:%.*]] = phi ptr [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[ITER_CHECK]] ] -; AVX512-NEXT: [[BC_RESUME_VAL18:%.*]] = phi ptr [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END17]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], [[ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[IND_END7]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR]], [[ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL12:%.*]] = phi ptr [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[ITER_CHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: -; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL15]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] -; AVX512-NEXT: [[DEST_ADDR_011:%.*]] = phi ptr [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] +; AVX512-NEXT: [[DEST_ADDR_011:%.*]] = phi ptr [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP35:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; AVX512-NEXT: store float [[TMP35]], ptr [[DEST_ADDR_011]], align 4 -; AVX512-NEXT: [[TMP36:%.*]] = load float, ptr [[PTR_ADDR_012]], align 4 +; AVX512-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; AVX512-NEXT: store float [[TMP26]], ptr [[DEST_ADDR_011]], align 4 +; AVX512-NEXT: [[TMP27:%.*]] = load float, ptr [[PTR_ADDR_012]], align 4 ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 1 -; AVX512-NEXT: store float [[TMP36]], ptr [[ARRAYIDX5]], align 4 +; AVX512-NEXT: store float [[TMP27]], ptr [[ARRAYIDX5]], align 4 ; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1 ; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16 ; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]] -; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop 
[[LOOP16:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -987,86 +965,64 @@ ; FVW2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2 ; FVW2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 ; FVW2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2 -; FVW2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; FVW2: vector.memcheck: -; FVW2-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2 -; FVW2-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4 -; FVW2-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 2 -; FVW2-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 6 -; FVW2-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8 -; FVW2-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP8]] -; FVW2-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP6]], 2 -; FVW2-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 4 -; FVW2-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP10]] -; FVW2-NEXT: [[TMP11:%.*]] = mul nsw i64 [[IDX_EXT]], -4 -; FVW2-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP11]] -; FVW2-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], [[TMP4]] -; FVW2-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP12]] -; FVW2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DEST]], [[UGLYGEP1]] -; FVW2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[PTR]], [[UGLYGEP]] -; FVW2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; FVW2-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[UGLYGEP3]] -; FVW2-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP]] -; FVW2-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]] -; FVW2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]] -; FVW2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; FVW2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; FVW2: vector.ph: ; FVW2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2 ; FVW2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] -; FVW2-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 4 -; FVW2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP13]] -; FVW2-NEXT: [[TMP14:%.*]] = mul i64 [[N_VEC]], 64 -; FVW2-NEXT: [[IND_END7:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP14]] +; FVW2-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 4 +; FVW2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP4]] +; FVW2-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 64 +; FVW2-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP5]] ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: ; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FVW2-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0 -; FVW2-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 -; FVW2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP16]] -; FVW2-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0 -; FVW2-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 64 -; FVW2-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP18]] -; FVW2-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 1 -; FVW2-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 64 -; FVW2-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP20]] -; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP]], i64 [[IDXPROM]] -; FVW2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0 -; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP22]], align 4, !alias.scope !14 -; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 -; 
FVW2-NEXT: store float [[TMP23]], ptr [[NEXT_GEP9]], align 4, !alias.scope !17, !noalias !19 -; FVW2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 -; FVW2-NEXT: store float [[TMP24]], ptr [[NEXT_GEP10]], align 4, !alias.scope !17, !noalias !19 -; FVW2-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[NEXT_GEP]], i32 0 -; FVW2-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x float>, ptr [[TMP25]], align 4, !alias.scope !21 -; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP9]], i64 1 -; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP10]], i64 1 -; FVW2-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD11]], i32 0 -; FVW2-NEXT: store float [[TMP28]], ptr [[TMP26]], align 4, !alias.scope !17, !noalias !19 -; FVW2-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD11]], i32 1 -; FVW2-NEXT: store float [[TMP29]], ptr [[TMP27]], align 4, !alias.scope !17, !noalias !19 +; FVW2-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; FVW2-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; FVW2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP7]] +; FVW2-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; FVW2-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 64 +; FVW2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP9]] +; FVW2-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 +; FVW2-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 64 +; FVW2-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP11]] +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP]], i64 [[IDXPROM]] +; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 +; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP13]], align 4 +; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 +; FVW2-NEXT: store float [[TMP14]], ptr [[NEXT_GEP3]], align 4 +; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 +; FVW2-NEXT: store float [[TMP15]], ptr [[NEXT_GEP4]], align 4 +; FVW2-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[NEXT_GEP]], i32 0 +; FVW2-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP16]], align 4 +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP3]], i64 1 +; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP4]], i64 1 +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[WIDE_LOAD5]], i32 0 +; FVW2-NEXT: store float [[TMP19]], ptr [[TMP17]], align 4 +; FVW2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[WIDE_LOAD5]], i32 1 +; FVW2-NEXT: store float [[TMP20]], ptr [[TMP18]], align 4 ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FVW2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; FVW2-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; FVW2-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FVW2-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; FVW2: middle.block: ; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]] ; FVW2: scalar.ph: -; FVW2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ] -; FVW2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ] +; FVW2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr 
[ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ] +; FVW2-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ] ; FVW2-NEXT: br label [[FOR_BODY:%.*]] ; FVW2: for.body: ; FVW2-NEXT: [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] -; FVW2-NEXT: [[DEST_ADDR_011:%.*]] = phi ptr [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ] +; FVW2-NEXT: [[DEST_ADDR_011:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ] ; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 [[IDXPROM]] -; FVW2-NEXT: [[TMP31:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; FVW2-NEXT: store float [[TMP31]], ptr [[DEST_ADDR_011]], align 4 -; FVW2-NEXT: [[TMP32:%.*]] = load float, ptr [[PTR_ADDR_012]], align 4 +; FVW2-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; FVW2-NEXT: store float [[TMP22]], ptr [[DEST_ADDR_011]], align 4 +; FVW2-NEXT: [[TMP23:%.*]] = load float, ptr [[PTR_ADDR_012]], align 4 ; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 1 -; FVW2-NEXT: store float [[TMP32]], ptr [[ARRAYIDX5]], align 4 +; FVW2-NEXT: store float [[TMP23]], ptr [[ARRAYIDX5]], align 4 ; FVW2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1 ; FVW2-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16 ; FVW2-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]] -; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ;
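Reviewer note, illustration only (not part of the patch): the new phi handling in getUnderlyingObject returns a single underlying object for a pointer induction phi when every incoming value either resolves to the same object or loops back to the phi itself. A minimal IR sketch under that assumption (hypothetical function name, modeled on the pointer loop in ptr-replace-alloca.ll above):

define i8 @underlying_phi_example(i1 %c) {
entry:
  %alloca = alloca [32 x i8], align 4
  %gep0 = getelementptr inbounds [32 x i8], ptr %alloca, i64 0, i64 1
  br label %loop

loop:                                             ; preds = %loop, %entry
  ; Incoming value %gep0 strips to %alloca; incoming value %p.next is a gep
  ; off the phi itself, so visiting it terminates at the memoized phi entry.
  %p = phi ptr [ %gep0, %entry ], [ %p.next, %loop ]
  %p.next = getelementptr i8, ptr %p, i64 4
  br i1 %c, label %exit, label %loop

exit:
  %v = load i8, ptr %p, align 1
  ret i8 %v
}

With the patch applied, getUnderlyingObject(%p) should return %alloca: Visit seeds Visited[%p] = %p before walking the incoming values, %gep0 resolves to %alloca, and the back-edge visit of %p.next stops at the phi, leaving %alloca as the unique NewUnderlying. This is the property the BasicAA and InstCombine test updates above rely on.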