diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7129,12 +7129,10 @@ } SDValue VectorStep = DAG.getStepVector(sdl, VecTy); SDValue VectorInduction = DAG.getNode( - ISD::UADDO, sdl, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep); - SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction.getValue(0), + ISD::UADDSAT, sdl, VecTy, VectorIndex, VectorStep); + SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction, VectorTripCount, ISD::CondCode::SETULT); - setValue(&I, DAG.getNode(ISD::AND, sdl, CCVT, - DAG.getNOT(sdl, VectorInduction.getValue(1), CCVT), - SetCC)); + setValue(&I, SetCC); return; } case Intrinsic::experimental_vector_insert: { diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -76,14 +76,12 @@ define @lane_mask_nxv16i1_i8(i8 %index, i8 %TC) { ; CHECK-LABEL: lane_mask_nxv16i1_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: index z0.b, w0, #1 +; CHECK-NEXT: index z0.b, #0, #1 ; CHECK-NEXT: mov z1.b, w0 -; CHECK-NEXT: mov z2.b, w1 -; CHECK-NEXT: cmphi p1.b, p0/z, z1.b, z0.b -; CHECK-NEXT: cmphi p2.b, p0/z, z2.b, z0.b -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: and p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: uqadd z0.b, z1.b, z0.b +; CHECK-NEXT: mov z1.b, w1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: cmphi p0.b, p0/z, z1.b, z0.b ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv16i1.i8(i8 %index, i8 %TC) ret %active.lane.mask @@ -97,15 +95,12 @@ ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: add z0.h, z1.h, z0.h -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.h, w1 +; CHECK-NEXT: mov z1.h, w1 +; CHECK-NEXT: umin z0.h, z0.h, #255 ; CHECK-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: cmphi p2.h, p0/z, z2.h, z1.h -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: and p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: cmphi p0.h, p0/z, z1.h, z0.h ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv8i1.i8(i8 %index, i8 %TC) ret %active.lane.mask @@ -119,15 +114,12 @@ ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: and z1.s, z1.s, #0xff ; CHECK-NEXT: add z0.s, z1.s, z0.s -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.s, w1 +; CHECK-NEXT: mov z1.s, w1 +; CHECK-NEXT: umin z0.s, z0.s, #255 ; CHECK-NEXT: and z1.s, z1.s, #0xff -; CHECK-NEXT: and z2.s, z2.s, #0xff -; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, z0.s -; CHECK-NEXT: cmphi p2.s, p0/z, z2.s, z1.s -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: and p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: and z0.s, z0.s, #0xff +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: cmphi p0.s, p0/z, z1.s, z0.s ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv4i1.i8(i8 %index, i8 %TC) ret %active.lane.mask @@ -144,14 +136,11 @@ ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: mov z2.d, x1 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: and z1.d, z1.d, #0xff +; CHECK-NEXT: umin z0.d, z0.d, #255 ; CHECK-NEXT: and z2.d, z2.d, #0xff -; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, z0.d -; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z1.d -; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: and p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: and z0.d, z0.d, #0xff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv2i1.i8(i8 %index, i8 %TC) ret %active.lane.mask @@ -165,8 +154,6 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG @@ -174,60 +161,39 @@ ; CHECK-NEXT: index z0.s, #0, #1 ; CHECK-NEXT: mov z3.s, w0 ; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.s, w1 ; CHECK-NEXT: incw z1.s -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: add z4.s, z3.s, z0.s +; CHECK-NEXT: uqadd z5.s, z3.s, z0.s ; CHECK-NEXT: incw z2.s, all, mul #2 -; CHECK-NEXT: add z5.s, z3.s, z1.s +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: cmphi p1.s, p0/z, z4.s, z5.s +; CHECK-NEXT: uqadd z5.s, z3.s, z1.s +; CHECK-NEXT: cmphi p2.s, p0/z, z4.s, z5.s +; CHECK-NEXT: uqadd z5.s, z3.s, z2.s ; CHECK-NEXT: incw z6.s, all, mul #2 -; CHECK-NEXT: cmphi p1.s, p0/z, z3.s, z4.s -; CHECK-NEXT: cmphi p2.s, p0/z, z3.s, z5.s -; CHECK-NEXT: add z7.s, z3.s, z2.s -; CHECK-NEXT: add z25.s, z3.s, z6.s -; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h -; CHECK-NEXT: cmphi p2.s, p0/z, z3.s, z7.s -; CHECK-NEXT: cmphi p4.s, p0/z, z3.s, z25.s -; CHECK-NEXT: mov z24.s, w1 -; CHECK-NEXT: uzp1 p2.h, p2.h, p4.h ; CHECK-NEXT: incw z0.s, all, mul #4 +; CHECK-NEXT: cmphi p3.s, p0/z, z4.s, z5.s +; CHECK-NEXT: uqadd z5.s, z3.s, z6.s ; CHECK-NEXT: incw z1.s, all, mul #4 +; CHECK-NEXT: cmphi p4.s, p0/z, z4.s, z5.s +; CHECK-NEXT: uqadd z0.s, z3.s, z0.s +; CHECK-NEXT: uqadd z1.s, z3.s, z1.s ; CHECK-NEXT: incw z2.s, all, mul #4 ; CHECK-NEXT: incw z6.s, all, mul #4 -; CHECK-NEXT: cmphi p3.s, p0/z, z24.s, z5.s -; CHECK-NEXT: cmphi p5.s, p0/z, z24.s, z4.s -; CHECK-NEXT: cmphi p4.s, p0/z, z24.s, z7.s -; CHECK-NEXT: uzp1 p1.b, p1.b, p2.b -; CHECK-NEXT: cmphi p2.s, p0/z, z24.s, z25.s -; CHECK-NEXT: add z0.s, z3.s, z0.s -; CHECK-NEXT: add z1.s, z3.s, z1.s -; CHECK-NEXT: add z2.s, z3.s, z2.s -; CHECK-NEXT: add z4.s, z3.s, z6.s -; CHECK-NEXT: uzp1 p3.h, p5.h, p3.h -; CHECK-NEXT: uzp1 p2.h, p4.h, p2.h -; CHECK-NEXT: cmphi p4.s, p0/z, z3.s, z0.s -; CHECK-NEXT: cmphi p5.s, p0/z, z3.s, z1.s -; CHECK-NEXT: cmphi p6.s, p0/z, z3.s, z2.s -; CHECK-NEXT: cmphi p7.s, p0/z, z3.s, z4.s -; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h -; CHECK-NEXT: uzp1 p5.h, p6.h, p7.h -; CHECK-NEXT: uzp1 p2.b, p3.b, p2.b -; CHECK-NEXT: uzp1 p3.b, p4.b, p5.b -; CHECK-NEXT: cmphi p4.s, p0/z, z24.s, z0.s -; CHECK-NEXT: cmphi p5.s, p0/z, z24.s, z1.s -; CHECK-NEXT: ptrue p6.b -; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h -; CHECK-NEXT: cmphi p5.s, p0/z, z24.s, z2.s -; CHECK-NEXT: cmphi p0.s, p0/z, z24.s, z4.s -; CHECK-NEXT: not p1.b, p6/z, p1.b -; CHECK-NEXT: uzp1 p0.h, p5.h, p0.h -; CHECK-NEXT: not p3.b, p6/z, p3.b -; CHECK-NEXT: uzp1 p4.b, p4.b, p0.b -; CHECK-NEXT: and p0.b, p6/z, p1.b, p2.b -; CHECK-NEXT: and p1.b, p6/z, p3.b, p4.b -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h +; CHECK-NEXT: uzp1 p2.h, p3.h, p4.h +; CHECK-NEXT: cmphi p3.s, p0/z, z4.s, z0.s +; CHECK-NEXT: cmphi p4.s, p0/z, z4.s, z1.s +; CHECK-NEXT: uqadd z0.s, z3.s, z2.s +; CHECK-NEXT: uqadd z1.s, z3.s, z6.s +; CHECK-NEXT: cmphi p5.s, p0/z, z4.s, z0.s +; CHECK-NEXT: cmphi p0.s, p0/z, z4.s, z1.s +; CHECK-NEXT: uzp1 p3.h, p3.h, p4.h +; CHECK-NEXT: uzp1 p4.h, p5.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b +; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -241,135 +207,90 @@ ; CHECK-LABEL: lane_mask_nxv32i1_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: index z2.d, #0, #1 -; CHECK-NEXT: mov z0.d, x0 -; CHECK-NEXT: mov z3.d, z2.d -; CHECK-NEXT: mov z4.d, z2.d -; CHECK-NEXT: incd z3.d -; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: mov z7.d, z3.d +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: mov z3.d, x0 +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: incd z7.d, all, mul #2 -; CHECK-NEXT: add z5.d, z0.d, z2.d -; CHECK-NEXT: add z6.d, z0.d, z3.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, x1 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: uqadd z5.d, z3.d, z0.d +; CHECK-NEXT: uqadd z6.d, z3.d, z1.d +; CHECK-NEXT: cmphi p1.d, p0/z, z4.d, z5.d +; CHECK-NEXT: mov z5.d, z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: cmphi p2.d, p0/z, z4.d, z6.d +; CHECK-NEXT: uqadd z6.d, z3.d, z2.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: incd z5.d, all, mul #2 +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: cmphi p2.d, p0/z, z4.d, z6.d +; CHECK-NEXT: uqadd z6.d, z3.d, z5.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: incd z7.d, all, mul #4 +; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z6.d +; CHECK-NEXT: uqadd z6.d, z3.d, z7.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z4.d -; CHECK-NEXT: mov z29.d, z7.d -; CHECK-NEXT: cmphi p1.d, p0/z, z0.d, z5.d -; CHECK-NEXT: cmphi p2.d, p0/z, z0.d, z6.d -; CHECK-NEXT: add z24.d, z0.d, z4.d -; CHECK-NEXT: add z27.d, z0.d, z7.d +; CHECK-NEXT: incd z24.d, all, mul #4 +; CHECK-NEXT: mov z26.d, z5.d +; CHECK-NEXT: cmphi p4.d, p0/z, z4.d, z6.d +; CHECK-NEXT: uqadd z6.d, z3.d, z24.d ; CHECK-NEXT: incd z25.d, all, mul #4 +; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z6.d +; CHECK-NEXT: uqadd z6.d, z3.d, z25.d ; CHECK-NEXT: incd z26.d, all, mul #4 -; CHECK-NEXT: incd z28.d, all, mul #4 -; CHECK-NEXT: incd z29.d, all, mul #4 -; CHECK-NEXT: cmphi p3.d, p0/z, z0.d, z24.d -; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s -; CHECK-NEXT: cmphi p2.d, p0/z, z0.d, z27.d -; CHECK-NEXT: add z30.d, z0.d, z25.d -; CHECK-NEXT: add z31.d, z0.d, z26.d -; CHECK-NEXT: add z8.d, z0.d, z28.d -; CHECK-NEXT: add z9.d, z0.d, z29.d -; CHECK-NEXT: uzp1 p2.s, p3.s, p2.s -; CHECK-NEXT: cmphi p3.d, p0/z, z0.d, z30.d -; CHECK-NEXT: cmphi p4.d, p0/z, z0.d, z31.d -; CHECK-NEXT: cmphi p5.d, p0/z, z0.d, z8.d -; CHECK-NEXT: cmphi p6.d, p0/z, z0.d, z9.d -; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s -; CHECK-NEXT: uzp1 p4.s, p5.s, p6.s -; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h -; CHECK-NEXT: uzp1 p2.h, p3.h, p4.h -; CHECK-NEXT: mov z1.d, x1 -; CHECK-NEXT: uzp1 p1.b, p1.b, p2.b -; CHECK-NEXT: cmphi p2.d, p0/z, z1.d, z6.d -; CHECK-NEXT: cmphi p3.d, p0/z, z1.d, z5.d -; CHECK-NEXT: cmphi p4.d, p0/z, z1.d, z24.d -; CHECK-NEXT: cmphi p5.d, p0/z, z1.d, z27.d -; CHECK-NEXT: uzp1 p2.s, p3.s, p2.s -; CHECK-NEXT: uzp1 p3.s, p4.s, p5.s -; CHECK-NEXT: cmphi p4.d, p0/z, z1.d, z30.d -; CHECK-NEXT: cmphi p5.d, p0/z, z1.d, z31.d -; CHECK-NEXT: cmphi p6.d, p0/z, z1.d, z8.d -; CHECK-NEXT: cmphi p7.d, p0/z, z1.d, z9.d +; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z6.d +; CHECK-NEXT: uqadd z6.d, z3.d, z26.d +; CHECK-NEXT: uzp1 p2.s, p2.s, p3.s +; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z6.d +; CHECK-NEXT: incd z0.d, all, mul #8 +; CHECK-NEXT: incd z1.d, all, mul #8 +; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s +; CHECK-NEXT: uzp1 p3.s, p6.s, p3.s +; CHECK-NEXT: uqadd z0.d, z3.d, z0.d +; CHECK-NEXT: uqadd z1.d, z3.d, z1.d ; CHECK-NEXT: incd z2.d, all, mul #8 -; CHECK-NEXT: incd z3.d, all, mul #8 -; CHECK-NEXT: incd z4.d, all, mul #8 +; CHECK-NEXT: incd z5.d, all, mul #8 +; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h +; CHECK-NEXT: uzp1 p2.h, p4.h, p3.h +; CHECK-NEXT: cmphi p3.d, p0/z, z4.d, z0.d +; CHECK-NEXT: cmphi p4.d, p0/z, z4.d, z1.d +; CHECK-NEXT: uqadd z0.d, z3.d, z2.d +; CHECK-NEXT: uqadd z1.d, z3.d, z5.d ; CHECK-NEXT: incd z7.d, all, mul #8 -; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s -; CHECK-NEXT: uzp1 p5.s, p6.s, p7.s -; CHECK-NEXT: add z2.d, z0.d, z2.d -; CHECK-NEXT: add z3.d, z0.d, z3.d -; CHECK-NEXT: add z4.d, z0.d, z4.d -; CHECK-NEXT: add z5.d, z0.d, z7.d +; CHECK-NEXT: incd z24.d, all, mul #8 +; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z0.d +; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z1.d +; CHECK-NEXT: uqadd z0.d, z3.d, z7.d +; CHECK-NEXT: uqadd z1.d, z3.d, z24.d ; CHECK-NEXT: incd z25.d, all, mul #8 ; CHECK-NEXT: incd z26.d, all, mul #8 -; CHECK-NEXT: incd z28.d, all, mul #8 -; CHECK-NEXT: incd z29.d, all, mul #8 -; CHECK-NEXT: uzp1 p2.h, p2.h, p3.h -; CHECK-NEXT: uzp1 p3.h, p4.h, p5.h -; CHECK-NEXT: cmphi p4.d, p0/z, z0.d, z2.d -; CHECK-NEXT: cmphi p5.d, p0/z, z0.d, z3.d -; CHECK-NEXT: cmphi p6.d, p0/z, z0.d, z4.d -; CHECK-NEXT: cmphi p7.d, p0/z, z0.d, z5.d -; CHECK-NEXT: add z6.d, z0.d, z25.d -; CHECK-NEXT: add z7.d, z0.d, z26.d -; CHECK-NEXT: add z24.d, z0.d, z28.d -; CHECK-NEXT: add z25.d, z0.d, z29.d -; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s -; CHECK-NEXT: uzp1 p5.s, p6.s, p7.s -; CHECK-NEXT: cmphi p6.d, p0/z, z0.d, z6.d -; CHECK-NEXT: cmphi p7.d, p0/z, z0.d, z7.d -; CHECK-NEXT: cmphi p8.d, p0/z, z0.d, z24.d -; CHECK-NEXT: cmphi p9.d, p0/z, z0.d, z25.d -; CHECK-NEXT: uzp1 p6.s, p6.s, p7.s -; CHECK-NEXT: uzp1 p7.s, p8.s, p9.s -; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h -; CHECK-NEXT: uzp1 p5.h, p6.h, p7.h -; CHECK-NEXT: uzp1 p2.b, p2.b, p3.b -; CHECK-NEXT: uzp1 p3.b, p4.b, p5.b -; CHECK-NEXT: cmphi p4.d, p0/z, z1.d, z2.d -; CHECK-NEXT: cmphi p5.d, p0/z, z1.d, z3.d -; CHECK-NEXT: cmphi p6.d, p0/z, z1.d, z4.d -; CHECK-NEXT: cmphi p7.d, p0/z, z1.d, z5.d -; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s -; CHECK-NEXT: uzp1 p5.s, p6.s, p7.s -; CHECK-NEXT: cmphi p6.d, p0/z, z1.d, z6.d -; CHECK-NEXT: cmphi p7.d, p0/z, z1.d, z7.d -; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h -; CHECK-NEXT: uzp1 p5.s, p6.s, p7.s -; CHECK-NEXT: cmphi p6.d, p0/z, z1.d, z24.d -; CHECK-NEXT: cmphi p0.d, p0/z, z1.d, z25.d -; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.s, p6.s, p0.s -; CHECK-NEXT: ptrue p6.b -; CHECK-NEXT: uzp1 p0.h, p5.h, p0.h -; CHECK-NEXT: not p1.b, p6/z, p1.b -; CHECK-NEXT: not p3.b, p6/z, p3.b -; CHECK-NEXT: uzp1 p4.b, p4.b, p0.b -; CHECK-NEXT: and p0.b, p6/z, p1.b, p2.b -; CHECK-NEXT: and p1.b, p6/z, p3.b, p4.b -; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s +; CHECK-NEXT: uzp1 p4.s, p5.s, p6.s +; CHECK-NEXT: cmphi p5.d, p0/z, z4.d, z0.d +; CHECK-NEXT: cmphi p6.d, p0/z, z4.d, z1.d +; CHECK-NEXT: uqadd z0.d, z3.d, z25.d +; CHECK-NEXT: uqadd z1.d, z3.d, z26.d +; CHECK-NEXT: cmphi p7.d, p0/z, z4.d, z0.d +; CHECK-NEXT: cmphi p0.d, p0/z, z4.d, z1.d +; CHECK-NEXT: uzp1 p5.s, p5.s, p6.s +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: uzp1 p3.h, p3.h, p4.h +; CHECK-NEXT: uzp1 p4.h, p5.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b +; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %TC) @@ -379,31 +300,17 @@ define @lane_mask_nxv32i1_i8(i8 %index, i8 %TC) { ; CHECK-LABEL: lane_mask_nxv32i1_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: index z1.b, #0, #1 -; CHECK-NEXT: mov z0.b, w8 +; CHECK-NEXT: index z0.b, #0, #1 +; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: add z0.b, z1.b, z0.b -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: add z4.b, z2.b, z1.b -; CHECK-NEXT: add z0.b, z2.b, z0.b +; CHECK-NEXT: add z1.b, z0.b, z1.b ; CHECK-NEXT: mov z3.b, w1 -; CHECK-NEXT: cmphi p2.b, p1/z, z2.b, z4.b -; CHECK-NEXT: cmphi p3.b, p1/z, z2.b, z0.b -; CHECK-NEXT: cmphi p0.b, p1/z, z3.b, z4.b -; CHECK-NEXT: not p2.b, p1/z, p2.b -; CHECK-NEXT: cmphi p4.b, p1/z, z3.b, z0.b -; CHECK-NEXT: not p3.b, p1/z, p3.b -; CHECK-NEXT: and p0.b, p1/z, p2.b, p0.b -; CHECK-NEXT: and p1.b, p1/z, p3.b, p4.b -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: uqadd z0.b, z2.b, z0.b +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: uqadd z1.b, z2.b, z1.b +; CHECK-NEXT: cmphi p0.b, p1/z, z3.b, z0.b +; CHECK-NEXT: cmphi p1.b, p1/z, z3.b, z1.b ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv32i1.i8(i8 %index, i8 %TC) ret %active.lane.mask diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll @@ -15,13 +15,9 @@ ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q2, q0, r1 -; CHECK-NEXT: vdup.32 q3, r1 -; CHECK-NEXT: vcmp.u32 hi, q3, q2 +; CHECK-NEXT: vqadd.u32 q2, q0, r1 ; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpsttt -; CHECK-NEXT: vcmpt.u32 hi, q1, q2 +; CHECK-NEXT: vptt.u32 hi, q1, q2 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vaddvat.u32 r2, q2 ; CHECK-NEXT: le lr, .LBB0_1 diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -4,53 +4,59 @@ define <2 x i64> @v2i64(i32 %index, i32 %TC, <2 x i64> %V1, <2 x i64> %V2) { ; CHECK-LABEL: v2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov q0[2], q0[0], r0, r0 -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vmov q1[2], q1[0], r0, r0 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vmov r0, r4, d3 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r1 -; CHECK-NEXT: vmov r0, r12, d1 -; CHECK-NEXT: vmov lr, s0 -; CHECK-NEXT: adds r0, #1 -; CHECK-NEXT: vmov q0[2], q0[0], lr, r0 -; CHECK-NEXT: adc r12, r12, #0 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vldr d1, [sp, #16] -; CHECK-NEXT: eors r0, r4 -; CHECK-NEXT: orrs.w r0, r0, r12 -; CHECK-NEXT: vmov r1, r0, d3 -; CHECK-NEXT: cset r12, eq -; CHECK-NEXT: subs r1, r4, r1 -; CHECK-NEXT: sbcs.w r0, r5, r0 -; CHECK-NEXT: vmov r1, r5, d0 -; CHECK-NEXT: cset r0, lo -; CHECK-NEXT: vmov d0, r2, r3 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: and.w r0, r0, r12 -; CHECK-NEXT: rsb.w r12, r0, #0 -; CHECK-NEXT: vmov r4, r0, d2 -; CHECK-NEXT: subs r4, r1, r4 -; CHECK-NEXT: sbcs.w r0, r5, r0 +; CHECK-NEXT: vmov lr, r12, d2 +; CHECK-NEXT: adds r6, r0, #1 +; CHECK-NEXT: adc r4, r4, #0 +; CHECK-NEXT: subs.w r0, lr, #-1 +; CHECK-NEXT: sbcs r0, r12, #0 +; CHECK-NEXT: vmov q1[2], q1[0], lr, r6 ; CHECK-NEXT: cset r0, lo +; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: teq.w r1, lr -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: ands r0, r1 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: bfi r1, r0, #0, #8 +; CHECK-NEXT: mov.w r0, #0 +; CHECK-NEXT: csetm r12, ne +; CHECK-NEXT: subs.w r6, r6, #-1 +; CHECK-NEXT: sbcs r6, r4, #0 +; CHECK-NEXT: bfi r5, r12, #0, #8 +; CHECK-NEXT: cset r6, lo +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: csetm r6, ne +; CHECK-NEXT: bfi r5, r6, #8, #8 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: vpsel q1, q1, q0 +; CHECK-NEXT: vand q0, q2, q0 +; CHECK-NEXT: vmov r1, r4, d0 +; CHECK-NEXT: vmov r6, r5, d2 +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: subs r1, r6, r1 +; CHECK-NEXT: sbcs.w r1, r5, r4 +; CHECK-NEXT: vmov r5, r4, d1 +; CHECK-NEXT: cset r1, lo +; CHECK-NEXT: vldr d1, [sp, #16] +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #0, #8 +; CHECK-NEXT: vmov r1, r6, d3 +; CHECK-NEXT: subs r1, r1, r5 +; CHECK-NEXT: sbcs.w r1, r6, r4 +; CHECK-NEXT: cset r1, lo +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: bfi r0, r1, #8, #8 +; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: add r0, sp, #24 -; CHECK-NEXT: bfi r1, r12, #8, #8 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmsr p0, r1 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC) %select = select <2 x i1> %active.lane.mask, <2 x i64> %V1, <2 x i64> %V2 ret <2 x i64> %select @@ -60,15 +66,11 @@ ; CHECK-LABEL: v4i32: ; CHECK: @ %bb.0: ; CHECK-NEXT: adr.w r12, .LCPI1_0 -; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: vldrw.u32 q0, [r12] -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vqadd.u32 q0, q0, r0 ; CHECK-NEXT: add r0, sp, #8 ; CHECK-NEXT: vcmp.u32 hi, q1, q0 -; CHECK-NEXT: vdup.32 q1, r1 -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.u32 hi, q1, q0 ; CHECK-NEXT: vldr d1, [sp] ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov d0, r2, r3 @@ -91,41 +93,31 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) { ; CHECK-LABEL: v7i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: adr r3, .LCPI2_0 -; CHECK-NEXT: vdup.32 q1, r1 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: ldr.w r12, [sp, #40] +; CHECK-NEXT: vdup.32 q3, r2 ; CHECK-NEXT: ldr r3, [sp, #32] -; CHECK-NEXT: vadd.i32 q2, q0, r1 -; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vcmp.u32 hi, q1, q2 -; CHECK-NEXT: ldr r2, [sp, #40] -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.u32 hi, q0, q2 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: ldr r2, [sp, #44] +; CHECK-NEXT: adr r2, .LCPI2_1 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r12 +; CHECK-NEXT: ldr.w r12, [sp, #44] ; CHECK-NEXT: ldr r3, [sp, #36] -; CHECK-NEXT: vmov q2[3], q2[1], r3, r2 -; CHECK-NEXT: ldr r2, [sp, #8] +; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 +; CHECK-NEXT: ldr.w r12, [sp, #8] ; CHECK-NEXT: ldr r3, [sp] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: ldr r2, [sp, #12] +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: ldr.w r12, [sp, #12] ; CHECK-NEXT: ldr r3, [sp, #4] -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: adr r2, .LCPI2_1 -; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vstrw.32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r2] -; CHECK-NEXT: movw r2, #4095 -; CHECK-NEXT: vadd.i32 q2, q2, r1 -; CHECK-NEXT: vcmp.u32 hi, q1, q2 -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: adr r3, .LCPI2_0 +; CHECK-NEXT: vldrw.u32 q2, [r3] +; CHECK-NEXT: vqadd.u32 q2, q2, r1 +; CHECK-NEXT: vcmp.u32 hi, q3, q2 +; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: ldr r2, [sp, #48] -; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vqadd.u32 q0, q0, r1 ; CHECK-NEXT: ldr r1, [sp, #52] -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.u32 hi, q0, q2 +; CHECK-NEXT: vcmp.u32 hi, q3, q0 ; CHECK-NEXT: vmov.32 q0[1], r1 ; CHECK-NEXT: ldr r1, [sp, #56] ; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 @@ -162,15 +154,15 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-LABEL: v8i16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr.w r12, .LCPI3_0 -; CHECK-NEXT: vdup.32 q5, r1 +; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: vldrw.u32 q0, [r12] -; CHECK-NEXT: vmov.i8 q1, #0x0 -; CHECK-NEXT: vmov.i8 q2, #0xff -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vcmp.u32 hi, q5, q3 -; CHECK-NEXT: vpsel q4, q2, q1 +; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vqadd.u32 q0, q0, r0 +; CHECK-NEXT: vcmp.u32 hi, q1, q0 +; CHECK-NEXT: vpsel q4, q3, q2 ; CHECK-NEXT: vmov r1, r12, d8 ; CHECK-NEXT: vmov.16 q0[0], r1 ; CHECK-NEXT: vmov.16 q0[1], r12 @@ -179,44 +171,24 @@ ; CHECK-NEXT: adr r1, .LCPI3_1 ; CHECK-NEXT: vldrw.u32 q4, [r1] ; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vcmp.u32 hi, q5, q4 -; CHECK-NEXT: vpsel q5, q2, q1 -; CHECK-NEXT: vmov r1, r12, d10 -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.16 q0[5], r12 -; CHECK-NEXT: vmov r1, r12, d11 -; CHECK-NEXT: vdup.32 q5, r0 -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vcmp.u32 hi, q5, q3 -; CHECK-NEXT: vmov.16 q0[7], r12 -; CHECK-NEXT: vpsel q6, q2, q1 -; CHECK-NEXT: vcmp.u32 hi, q5, q4 -; CHECK-NEXT: vmov r0, r1, d12 -; CHECK-NEXT: vpsel q1, q2, q1 -; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.16 q3[1], r1 -; CHECK-NEXT: vmov r0, r1, d13 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.16 q3[3], r1 +; CHECK-NEXT: vqadd.u32 q4, q4, r0 +; CHECK-NEXT: vcmp.u32 hi, q1, q4 +; CHECK-NEXT: vpsel q1, q3, q2 ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov.16 q3[5], r1 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 ; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: add r0, sp, #56 -; CHECK-NEXT: vmov.16 q3[7], r1 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: add r0, sp, #24 +; CHECK-NEXT: vmov.16 q0[7], r1 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vcmp.i16 ne, q3, zr -; CHECK-NEXT: vpnot -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.i16 ne, q0, zr -; CHECK-NEXT: vldr d1, [sp, #48] +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vldr d1, [sp, #16] ; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: @@ -238,175 +210,99 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-LABEL: v16i8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr.w r12, .LCPI4_0 -; CHECK-NEXT: vdup.32 q7, r1 +; CHECK-NEXT: vdup.32 q3, r1 ; CHECK-NEXT: vldrw.u32 q0, [r12] -; CHECK-NEXT: vmov.i8 q5, #0x0 -; CHECK-NEXT: vmov.i8 q4, #0xff -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vcmp.u32 hi, q7, q1 -; CHECK-NEXT: vpsel q0, q4, q5 -; CHECK-NEXT: vmov r1, r12, d0 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vqadd.u32 q0, q0, r0 +; CHECK-NEXT: vcmp.u32 hi, q3, q0 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q4, q1, q0 +; CHECK-NEXT: vmov r1, r12, d8 ; CHECK-NEXT: vmov.16 q2[0], r1 ; CHECK-NEXT: vmov.16 q2[1], r12 -; CHECK-NEXT: vmov r1, r12, d1 +; CHECK-NEXT: vmov r1, r12, d9 ; CHECK-NEXT: vmov.16 q2[2], r1 ; CHECK-NEXT: adr r1, .LCPI4_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q4, [r1] ; CHECK-NEXT: vmov.16 q2[3], r12 -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vcmp.u32 hi, q7, q3 -; CHECK-NEXT: vpsel q0, q4, q5 -; CHECK-NEXT: vmov r1, r12, d0 +; CHECK-NEXT: vqadd.u32 q4, q4, r0 +; CHECK-NEXT: vcmp.u32 hi, q3, q4 +; CHECK-NEXT: vpsel q4, q1, q0 +; CHECK-NEXT: vmov r1, r12, d8 ; CHECK-NEXT: vmov.16 q2[4], r1 ; CHECK-NEXT: vmov.16 q2[5], r12 -; CHECK-NEXT: vmov r1, r12, d1 +; CHECK-NEXT: vmov r1, r12, d9 ; CHECK-NEXT: vmov.16 q2[6], r1 ; CHECK-NEXT: vmov.16 q2[7], r12 ; CHECK-NEXT: vcmp.i16 ne, q2, zr -; CHECK-NEXT: vpsel q0, q4, q5 -; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vpsel q4, q1, q0 +; CHECK-NEXT: vmov.u16 r1, q4[0] ; CHECK-NEXT: vmov.8 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.u16 r1, q4[1] ; CHECK-NEXT: vmov.8 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r1, q4[2] ; CHECK-NEXT: vmov.8 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r1, q4[3] ; CHECK-NEXT: vmov.8 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.u16 r1, q4[4] ; CHECK-NEXT: vmov.8 q2[4], r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.u16 r1, q4[5] ; CHECK-NEXT: vmov.8 q2[5], r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.u16 r1, q4[6] ; CHECK-NEXT: vmov.8 q2[6], r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r1, q4[7] ; CHECK-NEXT: vmov.8 q2[7], r1 ; CHECK-NEXT: adr r1, .LCPI4_2 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vcmp.u32 hi, q7, q0 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vpsel q6, q4, q5 -; CHECK-NEXT: vmov r1, r12, d12 -; CHECK-NEXT: vmov.16 q0[0], r1 -; CHECK-NEXT: vmov.16 q0[1], r12 -; CHECK-NEXT: vmov r1, r12, d13 -; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vqadd.u32 q4, q4, r0 +; CHECK-NEXT: vcmp.u32 hi, q3, q4 +; CHECK-NEXT: vpsel q5, q1, q0 +; CHECK-NEXT: vmov r1, r12, d10 +; CHECK-NEXT: vmov.16 q4[0], r1 +; CHECK-NEXT: vmov.16 q4[1], r12 +; CHECK-NEXT: vmov r1, r12, d11 +; CHECK-NEXT: vmov.16 q4[2], r1 ; CHECK-NEXT: adr r1, .LCPI4_3 -; CHECK-NEXT: vldrw.u32 q6, [r1] -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: vadd.i32 q6, q6, r0 -; CHECK-NEXT: vcmp.u32 hi, q7, q6 -; CHECK-NEXT: vpsel q7, q4, q5 -; CHECK-NEXT: vmov r1, r12, d14 -; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov.16 q0[5], r12 -; CHECK-NEXT: vmov r1, r12, d15 -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vdup.32 q7, r0 -; CHECK-NEXT: vmov.16 q0[7], r12 -; CHECK-NEXT: vcmp.i16 ne, q0, zr -; CHECK-NEXT: vpsel q0, q4, q5 -; CHECK-NEXT: vcmp.u32 hi, q7, q1 -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vpsel q1, q4, q5 -; CHECK-NEXT: vmov.8 q2[8], r1 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: vmov.8 q2[9], r1 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov.8 q2[10], r1 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov.8 q2[11], r1 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmov.8 q2[12], r1 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov.8 q2[13], r1 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: vmov.8 q2[14], r1 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: vmov.8 q2[15], r1 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vcmp.u32 hi, q7, q3 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vpsel q1, q4, q5 -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.16 q0[7], r1 -; CHECK-NEXT: vcmp.i16 ne, q0, zr -; CHECK-NEXT: vpsel q0, q4, q5 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.8 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.8 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.8 q3[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.8 q3[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.8 q3[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.8 q3[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.8 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q3[7], r0 -; CHECK-NEXT: vcmp.u32 hi, q7, q0 -; CHECK-NEXT: vpsel q1, q4, q5 -; CHECK-NEXT: vcmp.u32 hi, q7, q6 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vpsel q1, q4, q5 -; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.16 q0[7], r1 -; CHECK-NEXT: vcmp.i16 ne, q0, zr -; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vmov.16 q4[3], r12 +; CHECK-NEXT: vqadd.u32 q5, q5, r0 +; CHECK-NEXT: vcmp.u32 hi, q3, q5 +; CHECK-NEXT: vpsel q3, q1, q0 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.16 q4[5], r1 +; CHECK-NEXT: vmov r0, r1, d7 +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.16 q4[7], r1 +; CHECK-NEXT: vcmp.i16 ne, q4, zr +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.8 q3[8], r0 +; CHECK-NEXT: vmov.8 q2[8], r0 ; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.8 q3[9], r0 +; CHECK-NEXT: vmov.8 q2[9], r0 ; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.8 q3[10], r0 +; CHECK-NEXT: vmov.8 q2[10], r0 ; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.8 q3[11], r0 +; CHECK-NEXT: vmov.8 q2[11], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.8 q3[12], r0 +; CHECK-NEXT: vmov.8 q2[12], r0 ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.8 q3[13], r0 +; CHECK-NEXT: vmov.8 q2[13], r0 ; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.8 q3[14], r0 +; CHECK-NEXT: vmov.8 q2[14], r0 ; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.8 q3[15], r0 -; CHECK-NEXT: add r0, sp, #88 -; CHECK-NEXT: vcmp.i8 ne, q3, zr -; CHECK-NEXT: vldr d1, [sp, #80] -; CHECK-NEXT: vpnot +; CHECK-NEXT: vmov.8 q2[15], r0 +; CHECK-NEXT: add r0, sp, #40 +; CHECK-NEXT: vldr d1, [sp, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.i8 ne, q2, zr +; CHECK-NEXT: vcmp.i8 ne, q2, zr ; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll --- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll +++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll @@ -366,23 +366,23 @@ ; CHECK-NEXT: @ implicit-def: $r8 ; CHECK-NEXT: @ implicit-def: $r5 ; CHECK-NEXT: @ implicit-def: $r10 -; CHECK-NEXT: strd r3, r0, [sp] @ 8-byte Folded Spill +; CHECK-NEXT: strd r3, r0, [sp, #16] @ 8-byte Folded Spill ; CHECK-NEXT: add.w r6, r7, r2, lsr #1 ; CHECK-NEXT: add.w r1, r1, r2, lsr #1 ; CHECK-NEXT: movw r2, #65532 ; CHECK-NEXT: vdup.32 q6, r6 ; CHECK-NEXT: movt r2, #32767 ; CHECK-NEXT: ands r1, r2 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w r1, r7, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adr r1, .LCPI1_1 ; CHECK-NEXT: vldrw.u32 q5, [r1] -; CHECK-NEXT: vadd.i32 q0, q0, r12 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q3, q0, r12 +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 @@ -404,7 +404,7 @@ ; CHECK-NEXT: subs r1, r2, r1 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: add.w r10, r0, #7 -; CHECK-NEXT: ldrd r3, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r3, r0, [sp, #16] @ 8-byte Folded Reload ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 ; CHECK-NEXT: adds r5, #2 @@ -431,7 +431,8 @@ ; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: mov r7, r12 ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: ldrd r3, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r3, r0, [sp, #16] @ 8-byte Folded Reload +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload ; CHECK-NEXT: mov r12, r7 ; CHECK-NEXT: vdup.32 q0, r2 ; CHECK-NEXT: mov r7, r10 @@ -454,13 +455,13 @@ ; CHECK-NEXT: @ %bb.9: @ %for.body13.us51.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 ; CHECK-NEXT: movw r2, :lower16:a -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: movt r2, :upper16:a ; CHECK-NEXT: str r1, [r2] ; CHECK-NEXT: movw r2, :lower16:b ; CHECK-NEXT: movt r2, :upper16:b ; CHECK-NEXT: str r1, [r2] -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: dlstp.32 lr, r6 ; CHECK-NEXT: .LBB1_10: @ %vector.body111 ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 @@ -474,21 +475,17 @@ ; CHECK-NEXT: b .LBB1_13 ; CHECK-NEXT: .LBB1_11: @ %vector.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: .LBB1_12: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q2, q5, r1 -; CHECK-NEXT: vdup.32 q3, r1 -; CHECK-NEXT: vcmp.u32 hi, q3, q2 +; CHECK-NEXT: vqadd.u32 q2, q5, r1 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vpnot -; CHECK-NEXT: add.w r1, r1, #4 -; CHECK-NEXT: vpst -; CHECK-NEXT: vcmpt.u32 hi, q6, q2 +; CHECK-NEXT: vcmp.u32 hi, q6, q2 ; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: add.w r1, r1, #4 ; CHECK-NEXT: vadd.i32 q2, q2, r11 ; CHECK-NEXT: vadd.i32 q1, q1, q7 ; CHECK-NEXT: vpst @@ -509,7 +506,7 @@ ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: beq.w .LBB1_2 ; CHECK-NEXT: @ %bb.16: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: ldrd r3, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r3, r0, [sp, #16] @ 8-byte Folded Reload ; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: .LBB1_17: @ %for.body6.us60 ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1