Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -19958,8 +19958,8 @@
 ``llvm.get.active.lane.mask.*``, ``%icmp`` is an integer compare and ``ult``
 the unsigned less-than comparison operator. Overflow cannot occur in
 ``(%base + i)`` and its comparison against ``%n`` as it is performed in integer
-numbers and not in machine numbers. If ``%n`` is ``0``, then the result is a
-poison value. The above is equivalent to:
+numbers and not in machine numbers. If ``%n`` is unsigned less than ``%base``,
+then the result is a poison value. The above is equivalent to:
 
 ::
 
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7180,21 +7180,26 @@
     }
 
     SDValue TripCount = getValue(I.getOperand(1));
-    auto VecTy = CCVT.changeVectorElementType(ElementVT);
-
-    SDValue VectorIndex, VectorTripCount;
+    // We can lower to:
+    //   icmp ult step_vector, splat(TripCount-Index).
+    //
+    // Reasoning: By assumption, Index <= TripCount (otherwise the result is
+    // poison), so TripCount - Index cannot wrap, and lane i is active exactly
+    // when Index + i < TripCount, i.e. when i < TripCount - Index.
define <vscale x 16 x i1> @lane_mask_nxv16i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_nxv16i1_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z0.b, #0, #1
-; CHECK-NEXT:    mov z1.b, w0
-; CHECK-NEXT:    uqadd z0.b, z0.b, z1.b
-; CHECK-NEXT:    mov z1.b, w1
+; CHECK-NEXT:    sub w8, w1, w0
+; CHECK-NEXT:    index z1.b, #0, #1
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    cmphi p0.b, p0/z, z1.b, z0.b
+; CHECK-NEXT:    mov z0.b, w8
+; CHECK-NEXT:    cmphi p0.b, p0/z, z0.b, z1.b
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i8(i8 %index, i8 %TC)
   ret <vscale x 16 x i1> %active.lane.mask
@@ -92,17 +91,13 @@ define <vscale x 8 x i1> @lane_mask_nxv8i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_nxv8i1_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z0.h, #0, #1
-; CHECK-NEXT:    mov z1.h, w0
-; CHECK-NEXT:    and z0.h, z0.h, #0xff
-; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    mov z1.h, w1
-; CHECK-NEXT:    umin z0.h, z0.h, #255
+; CHECK-NEXT:    sub w8, w1, w0
+; CHECK-NEXT:    index z1.h, #0, #1
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    and z0.h, z0.h, #0xff
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    cmphi p0.h, p0/z, z1.h, z0.h
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    and z0.h, z0.h, #0xff
+; CHECK-NEXT:    cmphi p0.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i8(i8 %index, i8 %TC)
   ret <vscale x 8 x i1> %active.lane.mask
@@ -111,17 +106,13 @@ define <vscale x 4 x i1> @lane_mask_nxv4i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_nxv4i1_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z0.s, #0, #1
-; CHECK-NEXT:    mov z1.s, w0
-; CHECK-NEXT:    and z0.s, z0.s, #0xff
+; CHECK-NEXT:    sub w8, w1, w0
+; CHECK-NEXT:    index z1.s, #0, #1
 ; CHECK-NEXT:    and z1.s, z1.s, #0xff
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    mov z1.s, w1
-; CHECK-NEXT:    umin z0.s, z0.s, #255
-; CHECK-NEXT:    and z1.s, z1.s, #0xff
-; CHECK-NEXT:    and z0.s, z0.s, #0xff
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    cmphi p0.s, p0/z, z1.s, z0.s
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    and z0.s, z0.s, #0xff
+; CHECK-NEXT:    cmphi p0.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i8(i8 %index, i8 %TC)
   ret <vscale x 4 x i1> %active.lane.mask
@@ -130,19 +121,13 @@ define <vscale x 2 x i1> @lane_mask_nxv2i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_nxv2i1_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xff +; CHECK-NEXT: sub w8, w1, w0 +; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: and z1.d, z1.d, #0xff -; CHECK-NEXT: add z0.d, z0.d, z1.d -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: mov z2.d, x1 -; CHECK-NEXT: umin z0.d, z0.d, #255 -; CHECK-NEXT: and z2.d, z2.d, #0xff -; CHECK-NEXT: and z0.d, z0.d, #0xff ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z0.d, x8 +; CHECK-NEXT: and z0.d, z0.d, #0xff +; CHECK-NEXT: cmphi p0.d, p0/z, z0.d, z1.d ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv2i1.i8(i8 %index, i8 %TC) ret %active.lane.mask @@ -160,42 +145,34 @@ ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: mov z3.s, w0 +; CHECK-NEXT: sub w8, w1, w0 ; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z4.s, w1 ; CHECK-NEXT: incw z1.s -; CHECK-NEXT: uqadd z5.s, z0.s, z3.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: mov z3.s, w8 ; CHECK-NEXT: incw z2.s, all, mul #2 -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: cmphi p1.s, p0/z, z4.s, z5.s -; CHECK-NEXT: uqadd z5.s, z1.s, z3.s -; CHECK-NEXT: cmphi p2.s, p0/z, z4.s, z5.s -; CHECK-NEXT: uqadd z5.s, z2.s, z3.s -; CHECK-NEXT: incw z6.s, all, mul #2 -; CHECK-NEXT: incw z0.s, all, mul #4 -; CHECK-NEXT: cmphi p3.s, p0/z, z4.s, z5.s -; CHECK-NEXT: uqadd z5.s, z6.s, z3.s -; CHECK-NEXT: incw z1.s, all, mul #4 +; CHECK-NEXT: incw z4.s, all, mul #2 ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: cmphi p4.s, p0/z, z4.s, z5.s -; CHECK-NEXT: uqadd z0.s, z0.s, z3.s -; CHECK-NEXT: uqadd z1.s, z1.s, z3.s +; CHECK-NEXT: cmphi p1.s, p0/z, z3.s, z2.s +; CHECK-NEXT: cmphi p2.s, p0/z, z3.s, z1.s +; CHECK-NEXT: cmphi p3.s, p0/z, z3.s, z0.s +; CHECK-NEXT: incw z1.s, all, mul #4 +; CHECK-NEXT: incw z0.s, all, mul #4 +; CHECK-NEXT: cmphi p4.s, p0/z, z3.s, z4.s ; CHECK-NEXT: incw z2.s, all, mul #4 -; CHECK-NEXT: incw z6.s, all, mul #4 -; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h -; CHECK-NEXT: uzp1 p2.h, p3.h, p4.h -; CHECK-NEXT: cmphi p3.s, p0/z, z4.s, z0.s -; CHECK-NEXT: cmphi p4.s, p0/z, z4.s, z1.s -; CHECK-NEXT: uqadd z0.s, z2.s, z3.s -; CHECK-NEXT: uqadd z1.s, z6.s, z3.s +; CHECK-NEXT: incw z4.s, all, mul #4 ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: cmphi p5.s, p0/z, z4.s, z0.s -; CHECK-NEXT: cmphi p0.s, p0/z, z4.s, z1.s -; CHECK-NEXT: uzp1 p3.h, p3.h, p4.h -; CHECK-NEXT: uzp1 p4.h, p5.h, p0.h -; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b +; CHECK-NEXT: uzp1 p2.h, p3.h, p2.h +; CHECK-NEXT: cmphi p3.s, p0/z, z3.s, z1.s +; CHECK-NEXT: cmphi p5.s, p0/z, z3.s, z0.s +; CHECK-NEXT: uzp1 p1.h, p1.h, p4.h +; CHECK-NEXT: cmphi p4.s, p0/z, z3.s, z2.s +; CHECK-NEXT: cmphi p0.s, p0/z, z3.s, z4.s +; CHECK-NEXT: uzp1 p3.h, p5.h, p3.h +; CHECK-NEXT: uzp1 p4.h, p4.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p2.b, p1.b ; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload @@ -215,75 +192,59 @@ ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: mov z3.d, x0 +; CHECK-NEXT: sub x8, x1, x0 ; CHECK-NEXT: mov z1.d, 
z0.d -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z4.d, x1 ; CHECK-NEXT: incd z1.d -; CHECK-NEXT: uqadd z5.d, z0.d, z3.d -; CHECK-NEXT: uqadd z6.d, z1.d, z3.d -; CHECK-NEXT: cmphi p1.d, p0/z, z4.d, z5.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z5.d, z1.d +; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: incd z2.d, all, mul #2 -; CHECK-NEXT: cmphi p2.d, p0/z, z4.d, z6.d -; CHECK-NEXT: uqadd z6.d, z2.d, z3.d -; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: incd z5.d, all, mul #2 -; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s -; CHECK-NEXT: cmphi p2.d, p0/z, z4.d, z6.d -; CHECK-NEXT: uqadd z6.d, z5.d, z3.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: cmphi p2.d, p0/z, z4.d, z1.d +; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z0.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: mov z24.d, z5.d +; CHECK-NEXT: cmphi p1.d, p0/z, z4.d, z2.d +; CHECK-NEXT: uzp1 p2.s, p3.s, p2.s +; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z5.d +; CHECK-NEXT: incd z6.d, all, mul #4 +; CHECK-NEXT: incd z3.d, all, mul #4 ; CHECK-NEXT: incd z7.d, all, mul #4 -; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z6.d -; CHECK-NEXT: uqadd z6.d, z7.d, z3.d -; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: incd z24.d, all, mul #4 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov z26.d, z5.d -; CHECK-NEXT: cmphi p4.d, p0/z, z4.d, z6.d -; CHECK-NEXT: uqadd z6.d, z24.d, z3.d -; CHECK-NEXT: incd z25.d, all, mul #4 -; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z6.d -; CHECK-NEXT: uqadd z6.d, z25.d, z3.d -; CHECK-NEXT: incd z26.d, all, mul #4 ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z6.d -; CHECK-NEXT: uqadd z6.d, z26.d, z3.d -; CHECK-NEXT: uzp1 p2.s, p2.s, p3.s +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s ; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z6.d -; CHECK-NEXT: incd z0.d, all, mul #8 +; CHECK-NEXT: cmphi p4.d, p0/z, z4.d, z3.d +; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z7.d +; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z24.d +; CHECK-NEXT: uzp1 p3.s, p4.s, p3.s +; CHECK-NEXT: uzp1 p4.s, p5.s, p6.s ; CHECK-NEXT: incd z1.d, all, mul #8 -; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s -; CHECK-NEXT: uzp1 p3.s, p6.s, p3.s -; CHECK-NEXT: uqadd z0.d, z0.d, z3.d -; CHECK-NEXT: uqadd z1.d, z1.d, z3.d +; CHECK-NEXT: incd z0.d, all, mul #8 ; CHECK-NEXT: incd z2.d, all, mul #8 ; CHECK-NEXT: incd z5.d, all, mul #8 -; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h -; CHECK-NEXT: uzp1 p2.h, p4.h, p3.h -; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z0.d -; CHECK-NEXT: cmphi p4.d, p0/z, z4.d, z1.d -; CHECK-NEXT: uqadd z0.d, z2.d, z3.d -; CHECK-NEXT: uqadd z1.d, z5.d, z3.d +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h +; CHECK-NEXT: uzp1 p2.h, p3.h, p4.h +; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z1.d +; CHECK-NEXT: cmphi p4.d, p0/z, z4.d, z0.d +; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z2.d +; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z5.d +; CHECK-NEXT: incd z3.d, all, mul #8 +; CHECK-NEXT: incd z6.d, all, mul #8 ; CHECK-NEXT: incd z7.d, all, mul #8 ; CHECK-NEXT: incd z24.d, all, mul #8 -; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z0.d -; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z1.d -; CHECK-NEXT: uqadd z0.d, z7.d, z3.d -; CHECK-NEXT: uqadd z1.d, z24.d, z3.d -; CHECK-NEXT: incd z25.d, all, mul #8 -; CHECK-NEXT: incd z26.d, all, mul #8 -; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s -; CHECK-NEXT: uzp1 p4.s, p5.s, p6.s -; CHECK-NEXT: 
cmphi p5.d, p0/z, z4.d, z0.d -; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z1.d -; CHECK-NEXT: uqadd z0.d, z25.d, z3.d -; CHECK-NEXT: uqadd z1.d, z26.d, z3.d ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: cmphi p7.d, p0/z, z4.d, z0.d -; CHECK-NEXT: cmphi p0.d, p0/z, z4.d, z1.d +; CHECK-NEXT: uzp1 p3.s, p4.s, p3.s +; CHECK-NEXT: uzp1 p4.s, p5.s, p6.s +; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z6.d +; CHECK-NEXT: cmphi p7.d, p0/z, z4.d, z7.d +; CHECK-NEXT: cmphi p0.d, p0/z, z4.d, z24.d ; CHECK-NEXT: uzp1 p5.s, p5.s, p6.s ; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s ; CHECK-NEXT: uzp1 p3.h, p3.h, p4.h @@ -304,17 +265,15 @@ define @lane_mask_nxv32i1_i8(i8 %index, i8 %TC) { ; CHECK-LABEL: lane_mask_nxv32i1_i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sub w9, w1, w0 ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: index z0.b, #0, #1 -; CHECK-NEXT: mov z1.b, w8 -; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: add z1.b, z0.b, z1.b -; CHECK-NEXT: mov z3.b, w1 -; CHECK-NEXT: uqadd z0.b, z0.b, z2.b -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: uqadd z1.b, z1.b, z2.b -; CHECK-NEXT: cmphi p0.b, p1/z, z3.b, z0.b -; CHECK-NEXT: cmphi p1.b, p1/z, z3.b, z1.b +; CHECK-NEXT: mov z0.b, w8 +; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: add z0.b, z1.b, z0.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z2.b, w9 +; CHECK-NEXT: cmphi p1.b, p0/z, z2.b, z0.b +; CHECK-NEXT: cmphi p0.b, p0/z, z2.b, z1.b ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv32i1.i8(i8 %index, i8 %TC) ret %active.lane.mask @@ -415,10 +374,9 @@ ; CHECK-LABEL: lane_mask_v16i1_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI23_0 -; CHECK-NEXT: dup v1.16b, w0 +; CHECK-NEXT: sub w9, w1, w0 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI23_0] -; CHECK-NEXT: uqadd v0.16b, v1.16b, v0.16b -; CHECK-NEXT: dup v1.16b, w1 +; CHECK-NEXT: dup v1.16b, w9 ; CHECK-NEXT: cmhi v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i8(i8 %index, i8 %TC) @@ -428,12 +386,11 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) { ; CHECK-LABEL: lane_mask_v8i1_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI24_0 -; CHECK-NEXT: dup v0.8b, w0 -; CHECK-NEXT: dup v2.8b, w1 -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI24_0] -; CHECK-NEXT: uqadd v0.8b, v0.8b, v1.8b -; CHECK-NEXT: cmhi v0.8b, v2.8b, v0.8b +; CHECK-NEXT: sub w8, w1, w0 +; CHECK-NEXT: adrp x9, .LCPI24_0 +; CHECK-NEXT: dup v0.8b, w8 +; CHECK-NEXT: ldr d1, [x9, :lo12:.LCPI24_0] +; CHECK-NEXT: cmhi v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i8(i8 %index, i8 %TC) ret <8 x i1> %active.lane.mask @@ -442,16 +399,12 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) { ; CHECK-LABEL: lane_mask_v4i1_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI25_0 -; CHECK-NEXT: dup v0.4h, w0 -; CHECK-NEXT: movi d2, #0xff00ff00ff00ff -; CHECK-NEXT: dup v3.4h, w1 -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI25_0] +; CHECK-NEXT: sub w8, w1, w0 +; CHECK-NEXT: adrp x9, .LCPI25_0 +; CHECK-NEXT: dup v0.4h, w8 +; CHECK-NEXT: ldr d1, [x9, :lo12:.LCPI25_0] ; CHECK-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-NEXT: bic v3.4h, #255, lsl #8 -; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: umin v0.4h, v0.4h, v2.4h -; CHECK-NEXT: cmhi v0.4h, v3.4h, v0.4h +; CHECK-NEXT: cmhi v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8 %index, i8 %TC) ret <4 x i1> %active.lane.mask @@ -460,16 +413,13 @@ define <2 x i1> 
@lane_mask_v2i1_i8(i8 %index, i8 %TC) { ; CHECK-LABEL: lane_mask_v2i1_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI26_0 -; CHECK-NEXT: movi d0, #0x0000ff000000ff -; CHECK-NEXT: dup v1.2s, w0 -; CHECK-NEXT: dup v3.2s, w1 -; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI26_0] -; CHECK-NEXT: and v1.8b, v1.8b, v0.8b -; CHECK-NEXT: add v1.2s, v1.2s, v2.2s -; CHECK-NEXT: umin v1.2s, v1.2s, v0.2s -; CHECK-NEXT: and v0.8b, v3.8b, v0.8b -; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s +; CHECK-NEXT: sub w8, w1, w0 +; CHECK-NEXT: adrp x9, .LCPI26_0 +; CHECK-NEXT: movi d1, #0x0000ff000000ff +; CHECK-NEXT: dup v0.2s, w8 +; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI26_0] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: cmhi v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC) ret <2 x i1> %active.lane.mask Index: llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll =================================================================== --- llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -4,10 +4,10 @@ define @get_lane_mask(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: get_lane_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; CHECK-NEXT: sub a0, a2, a1 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: ret %mask = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 %index, i64 %tc) ret %mask @@ -27,11 +27,10 @@ define @constant_nonzero_index(ptr %p, i64 %tc) { ; CHECK-LABEL: constant_nonzero_index: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; CHECK-NEXT: addi a0, a1, -24 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: li a0, 24 -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: vmsltu.vx v0, v8, a1 +; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: ret %mask = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 24, i64 %tc) ret %mask @@ -40,10 +39,10 @@ define @constant_tripcount(ptr %p, i64 %index) { ; CHECK-LABEL: constant_tripcount: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: li a0, 1024 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: ret %mask = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 %index, i64 1024) @@ -79,10 +78,10 @@ define <2 x i1> @fv2(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: fv2: ; CHECK: # %bb.0: +; CHECK-NEXT: sub a0, a2, a1 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: ret %mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 %index, i64 %tc) ret <2 x i1> %mask @@ -91,10 +90,10 @@ define <8 x i1> @fv8(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: fv8: ; CHECK: # %bb.0: +; CHECK-NEXT: sub a0, a2, a1 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: ret %mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 %index, i64 %tc) ret <8 x i1> %mask @@ -107,11 +106,10 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; 
CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: sub a0, a2, a1 +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 2 ; CHECK-NEXT: ret @@ -126,27 +124,24 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: sub a0, a2, a1 +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 2 -; CHECK-NEXT: lui a0, %hi(.LCPI9_1) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1) +; CHECK-NEXT: lui a1, %hi(.LCPI9_1) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI9_1) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 4 -; CHECK-NEXT: lui a0, %hi(.LCPI9_2) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2) +; CHECK-NEXT: lui a1, %hi(.LCPI9_2) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI9_2) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 6 ; CHECK-NEXT: ret @@ -161,59 +156,52 @@ ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: sub a0, a2, a1 +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 2 -; CHECK-NEXT: lui a0, %hi(.LCPI10_1) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1) +; CHECK-NEXT: lui a1, %hi(.LCPI10_1) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI10_1) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 4 -; CHECK-NEXT: lui a0, %hi(.LCPI10_2) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2) +; CHECK-NEXT: lui a1, %hi(.LCPI10_2) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI10_2) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 6 -; CHECK-NEXT: lui a0, %hi(.LCPI10_3) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3) +; CHECK-NEXT: lui a1, %hi(.LCPI10_3) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI10_3) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: 
vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 8 -; CHECK-NEXT: lui a0, %hi(.LCPI10_4) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4) +; CHECK-NEXT: lui a1, %hi(.LCPI10_4) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI10_4) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 10 -; CHECK-NEXT: lui a0, %hi(.LCPI10_5) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5) +; CHECK-NEXT: lui a1, %hi(.LCPI10_5) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI10_5) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 12 -; CHECK-NEXT: lui a0, %hi(.LCPI10_6) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6) +; CHECK-NEXT: lui a1, %hi(.LCPI10_6) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI10_6) ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v16, v8, a2 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vmsltu.vx v16, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, mu ; CHECK-NEXT: vslideup.vi v0, v16, 14 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll @@ -7,30 +7,20 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #126 -; CHECK-NEXT: adr r2, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: mov.w r2, #500 -; CHECK-NEXT: vdup.32 q1, r2 ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vqadd.u32 q2, q0, r1 +; CHECK-NEXT: rsb.w r3, r1, #500 ; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vptt.u32 hi, q1, q2 -; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vaddvat.u32 r2, q2 +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: vaddvat.u32 r2, q0 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: br label %vector.body Index: llvm/test/CodeGen/Thumb2/active_lane_mask.ll =================================================================== --- llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -4,51 +4,17 @@ define <2 x i64> @v2i64(i32 %index, i32 %TC, <2 x i64> %V1, <2 x i64> %V2) { ; CHECK-LABEL: v2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov q1[2], q1[0], r0, r0 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vmov r0, r4, d3 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r1 -; CHECK-NEXT: 
vmov lr, r12, d2 -; CHECK-NEXT: adds r6, r0, #1 -; CHECK-NEXT: adc r4, r4, #0 -; CHECK-NEXT: subs.w r0, lr, #-1 -; CHECK-NEXT: vmov q1[2], q1[0], lr, r6 -; CHECK-NEXT: sbcs r0, r12, #0 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 -; CHECK-NEXT: csetm r12, lo -; CHECK-NEXT: subs.w r6, r6, #-1 -; CHECK-NEXT: bfi r5, r12, #0, #8 -; CHECK-NEXT: sbcs r6, r4, #0 -; CHECK-NEXT: mov.w r0, #0 -; CHECK-NEXT: csetm r6, lo -; CHECK-NEXT: bfi r5, r6, #8, #8 -; CHECK-NEXT: vmsr p0, r5 -; CHECK-NEXT: vpsel q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vmov r1, r4, d0 -; CHECK-NEXT: vmov r6, r5, d2 -; CHECK-NEXT: vmov d0, r2, r3 -; CHECK-NEXT: subs r1, r6, r1 -; CHECK-NEXT: sbcs.w r1, r5, r4 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: csetm r1, lo -; CHECK-NEXT: vldr d1, [sp, #16] -; CHECK-NEXT: bfi r0, r1, #0, #8 -; CHECK-NEXT: vmov r1, r6, d3 -; CHECK-NEXT: subs r1, r1, r5 -; CHECK-NEXT: sbcs.w r1, r6, r4 -; CHECK-NEXT: csetm r1, lo -; CHECK-NEXT: bfi r0, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: add r0, sp, #24 +; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: vldr d1, [sp] +; CHECK-NEXT: vctp.64 r0 +; CHECK-NEXT: add r0, sp, #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmovt q1, q0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: bx lr %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC) %select = select <2 x i1> %active.lane.mask, <2 x i64> %V1, <2 x i64> %V2 ret <2 x i64> %select @@ -57,26 +23,17 @@ define <4 x i32> @v4i32(i32 %index, i32 %TC, <4 x i32> %V1, <4 x i32> %V2) { ; CHECK-LABEL: v4i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: adr.w r12, .LCPI1_0 -; CHECK-NEXT: vdup.32 q1, r1 -; CHECK-NEXT: vldrw.u32 q0, [r12] -; CHECK-NEXT: vqadd.u32 q0, q0, r0 -; CHECK-NEXT: add r0, sp, #8 -; CHECK-NEXT: vcmp.u32 hi, q1, q0 +; CHECK-NEXT: subs r0, r1, r0 ; CHECK-NEXT: vldr d1, [sp] +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: add r0, sp, #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov d0, r2, r3 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmovt q1, q0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d3 ; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI1_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %TC) %select = select <4 x i1> %active.lane.mask, <4 x i32> %V1, <4 x i32> %V2 ret <4 x i32> %select @@ -86,12 +43,16 @@ ; CHECK-LABEL: v7i32: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr.w r12, [sp, #40] -; CHECK-NEXT: vdup.32 q3, r2 +; CHECK-NEXT: subs r1, r2, r1 ; CHECK-NEXT: ldr r3, [sp, #32] -; CHECK-NEXT: adr r2, .LCPI2_1 +; CHECK-NEXT: vctp.32 r1 +; CHECK-NEXT: ldr r2, [sp, #52] +; CHECK-NEXT: vdup.32 q2, r1 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r12 ; CHECK-NEXT: ldr.w r12, [sp, #44] ; CHECK-NEXT: ldr r3, [sp, #36] +; CHECK-NEXT: adr r1, .LCPI2_0 +; CHECK-NEXT: vldrw.u32 q3, [r1] ; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 ; CHECK-NEXT: ldr.w r12, [sp, #8] ; CHECK-NEXT: ldr r3, [sp] @@ -99,26 +60,20 @@ ; CHECK-NEXT: ldr.w r12, [sp, #12] ; CHECK-NEXT: ldr r3, [sp, #4] ; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 -; CHECK-NEXT: adr r3, .LCPI2_0 -; CHECK-NEXT: vldrw.u32 q2, 
[r3] -; CHECK-NEXT: vqadd.u32 q2, q2, r1 -; CHECK-NEXT: vcmp.u32 hi, q3, q2 -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: ldr r3, [sp, #48] +; CHECK-NEXT: vpst +; CHECK-NEXT: vmovt q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: ldr r2, [sp, #48] -; CHECK-NEXT: vqadd.u32 q0, q0, r1 -; CHECK-NEXT: ldr r1, [sp, #52] -; CHECK-NEXT: vcmp.u32 hi, q3, q0 -; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: ldr r1, [sp, #56] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: ldr r1, [sp, #20] -; CHECK-NEXT: ldr r2, [sp, #16] -; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: ldr r1, [sp, #24] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: ldr r2, [sp, #56] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: ldr r2, [sp, #20] +; CHECK-NEXT: ldr r3, [sp, #16] +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: ldr r2, [sp, #24] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 +; CHECK-NEXT: vpt.u32 hi, q2, q3 +; CHECK-NEXT: vmovt q0, q1 ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vmov r3, s0 @@ -129,11 +84,6 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI2_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .LCPI2_1: ; CHECK-NEXT: .long 4 @ 0x4 ; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 6 @ 0x6 @@ -146,54 +96,17 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-LABEL: v8i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: adr.w r12, .LCPI3_0 -; CHECK-NEXT: vdup.32 q1, r1 -; CHECK-NEXT: vldrw.u32 q0, [r12] -; CHECK-NEXT: vmov.i8 q2, #0x0 -; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: vqadd.u32 q0, q0, r0 -; CHECK-NEXT: vcmp.u32 hi, q1, q0 -; CHECK-NEXT: vpsel q4, q3, q2 -; CHECK-NEXT: vmov r1, r12, d8 -; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov.16 q0[1], r12 -; CHECK-NEXT: vmov r1, r12, d9 -; CHECK-NEXT: vmov.16 q0[2], r1 -; CHECK-NEXT: adr r1, .LCPI3_1 -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: vqadd.u32 q4, q4, r0 -; CHECK-NEXT: vcmp.u32 hi, q1, q4 -; CHECK-NEXT: vpsel q1, q3, q2 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: add r0, sp, #24 -; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: vldr d1, [sp] +; CHECK-NEXT: vctp.16 r0 +; CHECK-NEXT: add r0, sp, #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vcmp.i16 ne, q0, zr -; CHECK-NEXT: vldr d1, [sp, #16] ; CHECK-NEXT: vmov d0, r2, r3 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpst +; CHECK-NEXT: vmovt q1, q0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d3 ; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI3_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .LCPI3_1: -; CHECK-NEXT: .long 4 @ 0x4 -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 7 @ 0x7 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC) %select = select <8 x i1> %active.lane.mask, <8 x i16> %V1, <8 x i16> %V2 ret <8 x i16> %select @@ -202,122 +115,17 @@ define <16 x i8> @v16i8(i32 %index, 
i32 %TC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-LABEL: v16i8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: adr.w r12, .LCPI4_0 -; CHECK-NEXT: vdup.32 q3, r1 -; CHECK-NEXT: vldrw.u32 q0, [r12] -; CHECK-NEXT: vmov.i8 q1, #0xff -; CHECK-NEXT: vqadd.u32 q0, q0, r0 -; CHECK-NEXT: vcmp.u32 hi, q3, q0 -; CHECK-NEXT: vmov.i8 q0, #0x0 -; CHECK-NEXT: vpsel q4, q1, q0 -; CHECK-NEXT: vmov r1, r12, d8 -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov.16 q2[1], r12 -; CHECK-NEXT: vmov r1, r12, d9 -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: adr r1, .LCPI4_1 -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: vmov.16 q2[3], r12 -; CHECK-NEXT: vqadd.u32 q4, q4, r0 -; CHECK-NEXT: vcmp.u32 hi, q3, q4 -; CHECK-NEXT: vpsel q4, q1, q0 -; CHECK-NEXT: vmov r1, r12, d8 -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov.16 q2[5], r12 -; CHECK-NEXT: vmov r1, r12, d9 -; CHECK-NEXT: vmov.16 q2[6], r1 -; CHECK-NEXT: vmov.16 q2[7], r12 -; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vpsel q4, q1, q0 -; CHECK-NEXT: vmov.u16 r1, q4[0] -; CHECK-NEXT: vmov.8 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q4[1] -; CHECK-NEXT: vmov.8 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q4[2] -; CHECK-NEXT: vmov.8 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q4[3] -; CHECK-NEXT: vmov.8 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q4[4] -; CHECK-NEXT: vmov.8 q2[4], r1 -; CHECK-NEXT: vmov.u16 r1, q4[5] -; CHECK-NEXT: vmov.8 q2[5], r1 -; CHECK-NEXT: vmov.u16 r1, q4[6] -; CHECK-NEXT: vmov.8 q2[6], r1 -; CHECK-NEXT: vmov.u16 r1, q4[7] -; CHECK-NEXT: vmov.8 q2[7], r1 -; CHECK-NEXT: adr r1, .LCPI4_2 -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: vqadd.u32 q4, q4, r0 -; CHECK-NEXT: vcmp.u32 hi, q3, q4 -; CHECK-NEXT: vpsel q5, q1, q0 -; CHECK-NEXT: vmov r1, r12, d10 -; CHECK-NEXT: vmov.16 q4[0], r1 -; CHECK-NEXT: vmov.16 q4[1], r12 -; CHECK-NEXT: vmov r1, r12, d11 -; CHECK-NEXT: vmov.16 q4[2], r1 -; CHECK-NEXT: adr r1, .LCPI4_3 -; CHECK-NEXT: vldrw.u32 q5, [r1] -; CHECK-NEXT: vmov.16 q4[3], r12 -; CHECK-NEXT: vqadd.u32 q5, q5, r0 -; CHECK-NEXT: vcmp.u32 hi, q3, q5 -; CHECK-NEXT: vpsel q3, q1, q0 -; CHECK-NEXT: vmov r0, r1, d6 -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.16 q4[5], r1 -; CHECK-NEXT: vmov r0, r1, d7 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov.16 q4[7], r1 -; CHECK-NEXT: vcmp.i16 ne, q4, zr -; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.8 q2[8], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.8 q2[9], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.8 q2[10], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.8 q2[11], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.8 q2[12], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.8 q2[13], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.8 q2[14], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.8 q2[15], r0 -; CHECK-NEXT: add r0, sp, #40 -; CHECK-NEXT: vldr d1, [sp, #32] +; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: vldr d1, [sp] +; CHECK-NEXT: vctp.8 r0 +; CHECK-NEXT: add r0, sp, #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vcmp.i8 ne, q2, zr ; CHECK-NEXT: vmov d0, r2, r3 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpst +; CHECK-NEXT: vmovt q1, q0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r2, r3, d3 ; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI4_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 
-; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .LCPI4_1: -; CHECK-NEXT: .long 4 @ 0x4 -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 7 @ 0x7 -; CHECK-NEXT: .LCPI4_2: -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 9 @ 0x9 -; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .LCPI4_3: -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 13 @ 0xd -; CHECK-NEXT: .long 14 @ 0xe -; CHECK-NEXT: .long 15 @ 0xf %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC) %select = select <16 x i1> %active.lane.mask, <16 x i8> %V1, <16 x i8> %V2 ret <16 x i8> %select Index: llvm/test/CodeGen/Thumb2/mve-blockplacement.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-blockplacement.ll +++ llvm/test/CodeGen/Thumb2/mve-blockplacement.ll @@ -335,8 +335,8 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: mov r12, r1 @@ -347,7 +347,7 @@ ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: csel r7, r2, r3, lt -; CHECK-NEXT: mov r10, r2 +; CHECK-NEXT: mov r4, r2 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: it ls @@ -356,20 +356,18 @@ ; CHECK-NEXT: subs r1, r1, r7 ; CHECK-NEXT: movt r2, #43690 ; CHECK-NEXT: adds r1, #2 -; CHECK-NEXT: ldr r4, [sp, #120] ; CHECK-NEXT: movw r11, :lower16:c ; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: umull r1, r2, r1, r2 ; CHECK-NEXT: movt r11, :upper16:c +; CHECK-NEXT: umull r1, r2, r1, r2 +; CHECK-NEXT: movs r5, #12 ; CHECK-NEXT: movs r1, #4 ; CHECK-NEXT: @ implicit-def: $r8 -; CHECK-NEXT: @ implicit-def: $r9 -; CHECK-NEXT: movs r5, #12 -; CHECK-NEXT: strd r12, r0, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: @ implicit-def: $r10 +; CHECK-NEXT: strd r0, r12, [sp, #4] @ 8-byte Folded Spill ; CHECK-NEXT: add.w r6, r3, r2, lsr #1 ; CHECK-NEXT: add.w r1, r1, r2, lsr #1 ; CHECK-NEXT: movw r2, #65532 -; CHECK-NEXT: vdup.32 q6, r6 ; CHECK-NEXT: movt r2, #32767 ; CHECK-NEXT: ands r1, r2 ; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill @@ -378,21 +376,19 @@ ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI1_1 -; CHECK-NEXT: vldrw.u32 q5, [r1] ; CHECK-NEXT: vadd.i32 q4, q0, r7 ; CHECK-NEXT: @ implicit-def: $r7 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: cmn.w r9, #4 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: cmn.w r10, #4 ; CHECK-NEXT: it le ; CHECK-NEXT: mvnle r0, #3 ; CHECK-NEXT: movw r2, #18725 ; CHECK-NEXT: adds r0, #6 ; CHECK-NEXT: movt r2, #9362 -; CHECK-NEXT: sub.w r1, r0, r9 +; CHECK-NEXT: sub.w r1, r0, r10 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: umull r2, r3, r1, r2 ; CHECK-NEXT: subs r2, r1, r3 @@ -402,8 +398,8 @@ ; CHECK-NEXT: sub.w r2, r3, r2, lsr #2 ; CHECK-NEXT: subs r1, r2, r1 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: add.w r9, r0, #7 -; CHECK-NEXT: ldrd r12, r0, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: add.w r10, r0, #7 +; CHECK-NEXT: ldrd r0, r12, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 ; 
CHECK-NEXT: add.w r8, r8, #2 @@ -417,7 +413,7 @@ ; CHECK-NEXT: @ Child Loop BB1_8 Depth 2 ; CHECK-NEXT: @ Child Loop BB1_10 Depth 3 ; CHECK-NEXT: @ Child Loop BB1_12 Depth 3 -; CHECK-NEXT: cmp.w r9, #2 +; CHECK-NEXT: cmp.w r10, #2 ; CHECK-NEXT: bgt .LBB1_3 ; CHECK-NEXT: @ %bb.5: @ %for.body6.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 @@ -426,20 +422,20 @@ ; CHECK-NEXT: bhi .LBB1_15 ; CHECK-NEXT: @ %bb.6: @ %for.body6.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: ldrd r2, r3, [sp, #112] +; CHECK-NEXT: ldrd r2, r3, [sp, #80] ; CHECK-NEXT: movs r0, #32 ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: ldrd r12, r0, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r0, r12, [sp, #4] @ 8-byte Folded Reload ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r9, r10 ; CHECK-NEXT: b .LBB1_8 ; CHECK-NEXT: .LBB1_7: @ %for.cond.cleanup17.us ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: add.w r9, r3, #7 -; CHECK-NEXT: cmn.w r3, #4 +; CHECK-NEXT: add.w r10, r9, #7 +; CHECK-NEXT: cmn.w r9, #4 ; CHECK-NEXT: mov.w r7, #0 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r9, r10 ; CHECK-NEXT: bge .LBB1_3 ; CHECK-NEXT: .LBB1_8: @ %for.body6.us ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 @@ -447,8 +443,7 @@ ; CHECK-NEXT: @ Child Loop BB1_10 Depth 3 ; CHECK-NEXT: @ Child Loop BB1_12 Depth 3 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: beq .LBB1_11 +; CHECK-NEXT: cbz r4, .LBB1_11 ; CHECK-NEXT: @ %bb.9: @ %for.body13.us51.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 ; CHECK-NEXT: movw r2, :lower16:a @@ -478,19 +473,20 @@ ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vqadd.u32 q2, q5, r1 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vcmp.u32 hi, q6, q2 ; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: subs r3, r6, r1 +; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: add.w r1, r1, #4 -; CHECK-NEXT: vadd.i32 q2, q2, r11 ; CHECK-NEXT: vadd.i32 q1, q1, r5 +; CHECK-NEXT: vadd.i32 q2, q2, r11 +; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [q2] ; CHECK-NEXT: bne .LBB1_12 ; CHECK-NEXT: .LBB1_13: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: ldr r1, [sp, #88] +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: beq .LBB1_7 ; CHECK-NEXT: @ %bb.14: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 @@ -500,11 +496,12 @@ ; CHECK-NEXT: b .LBB1_26 ; CHECK-NEXT: .LBB1_15: @ %for.body6.lr.ph.split ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: ldr r0, [sp, #88] +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: beq.w .LBB1_2 ; CHECK-NEXT: @ %bb.16: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: ldrd r12, r0, [sp, #4] @ 8-byte Folded Reload -; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: ldrd r0, r12, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: .LBB1_17: @ %for.body6.us60 ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -524,23 +521,23 @@ ; CHECK-NEXT: bgt .LBB1_24 ; CHECK-NEXT: @ %bb.21: @ %for.cond.cleanup17.us63.3 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 -; CHECK-NEXT: add.w r9, r2, #28 +; CHECK-NEXT: add.w r10, r2, #28 ; CHECK-NEXT: cmn.w r2, #25 ; CHECK-NEXT: mov.w r7, #0 -; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: blt .LBB1_17 ; CHECK-NEXT: b 
.LBB1_3 ; CHECK-NEXT: .LBB1_22: @ %for.cond.cleanup5.loopexit134.split.loop.exit139 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r9, r2, #7 +; CHECK-NEXT: add.w r10, r2, #7 ; CHECK-NEXT: b .LBB1_25 ; CHECK-NEXT: .LBB1_23: @ %for.cond.cleanup5.loopexit134.split.loop.exit137 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r9, r2, #14 +; CHECK-NEXT: add.w r10, r2, #14 ; CHECK-NEXT: b .LBB1_25 ; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit135 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r9, r2, #21 +; CHECK-NEXT: add.w r10, r2, #21 ; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 ; CHECK-NEXT: movs r7, #0 @@ -553,7 +550,7 @@ ; CHECK-NEXT: b .LBB1_27 ; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 @@ -563,11 +560,6 @@ ; CHECK-NEXT: .long 3 @ 0x3 ; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .long 9 @ 0x9 -; CHECK-NEXT: .LCPI1_1: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 entry: %cmp47 = icmp sgt i64 %e, 0 br i1 %cmp47, label %for.cond2.preheader.lr.ph, label %for.cond.cleanup
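
The new SelectionDAGBuilder comment above leans on one equivalence: once a trip count that is unsigned less than the index is defined to yield poison, lane i is active exactly when i < TripCount - Index, so the saturating add of the old expansion is no longer needed. A minimal LLVM IR sketch of that equivalence (not part of the patch; the fixed 4-lane width and the value names are illustrative only):

  define <4 x i1> @lane_mask_equiv(i32 %index, i32 %tc) {
    ; New-style expansion: one scalar sub, then an unsigned compare of the
    ; constant step vector <0, 1, 2, 3> against splat(%tc - %index).
    ; %tc >= %index is assumed; otherwise get.active.lane.mask is poison,
    ; so the subtraction never wraps for inputs the result is defined on.
    %n = sub i32 %tc, %index
    %n.ins = insertelement <4 x i32> poison, i32 %n, i64 0
    %n.splat = shufflevector <4 x i32> %n.ins, <4 x i32> poison, <4 x i32> zeroinitializer
    %mask = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %n.splat
    ret <4 x i1> %mask
  }

The test updates reflect the payoff of this form: the AArch64 SVE and NEON sequences drop their uqadd/umin saturation, the RISC-V code drops vsaddu.vx, and the Thumb2/MVE masks become plain vctp predicates, which is what lets the low-overhead-loop and constant-bound tests shed their step-vector constant pools.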