diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12236,6 +12236,15 @@
   return false;
 }
 
+// Returns true if \p I is (transitively, through a chain of single-use
+// instructions) used by a GetElementPtr.
+static bool isInstUsedByGEP(Instruction *I, unsigned Threshold) {
+  // Use a threshold to limit the amount of recursion.
+  if (!I->hasOneUse() || isa<GetElementPtrInst>(I) || Threshold == 0)
+    return false;
+  Use &U = *I->use_begin();
+  return isa<GetElementPtrInst>(U.getUser()) ||
+         isInstUsedByGEP(cast<Instruction>(U.getUser()), Threshold - 1);
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -12287,6 +12296,20 @@
   switch (I->getOpcode()) {
   case Instruction::Sub:
   case Instruction::Add: {
+    // Sink splat shuffles and stepvector intrinsics that feed an add/sub
+    // used (through a short single-use chain) by a GEP, so that ISel can
+    // fold the loop-invariant parts into the scalar base of a
+    // gather/scatter address.
+    unsigned OpsSize = Ops.size();
+    if (isInstUsedByGEP(I, /*Threshold*/ 4)) {
+      for (unsigned J = 0; J < I->getNumOperands(); ++J) {
+        Use &U = I->getOperandUse(J);
+        if (isSplatShuffle(U.get())) {
+          Ops.push_back(&cast<Instruction>(U.get())->getOperandUse(0));
+          Ops.push_back(&U);
+        } else if (auto *II = dyn_cast<IntrinsicInst>(U.get()))
+          if (II->getIntrinsicID() == Intrinsic::experimental_stepvector)
+            Ops.push_back(&U);
+      }
+      if (Ops.size() > OpsSize)
+        return true;
+    }
     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
       return false;
diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
--- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
@@ -331,8 +331,527 @@
   call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %gep.bc, i32 2, <vscale x 4 x i1> %pg)
   ret void
 }
+
+;; Check the fold when the step vector is defined outside the inner loop.
+;; The tests below are a simplification of the LLVM IR generated by:
+;; void gather_stride16(float *dst, float *src1, float *src2, int n) {
+;;   for (int i = 0; i < n; i++) {
+;;     for (int j = 0; j < n; j++) {
+;;       dst[i + j] += src1[i + (j * (int)16)] * src2[i + j];
+;;     }
+;;   }
+;; }
+
+define void @gather_nxv4f32(float* %src, float* %dst, <vscale x 4 x i1> %pg) #0 {
+; CHECK-LABEL: gather_nxv4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    index z0.s, #0, w9
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:  .LBB13_1: // %vector.ph
+; CHECK-NEXT:    // =>This Loop Header: Depth=1
+; CHECK-NEXT:    // Child Loop BB13_2 Depth 2
+; CHECK-NEXT:    mov x10, xzr
+; CHECK-NEXT:  .LBB13_2: // %vector.body
+; CHECK-NEXT:    // Parent Loop BB13_1 Depth=1
+; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    add x11, x0, x8, lsl #2
+; CHECK-NEXT:    add x11, x11, x10, lsl #6
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x11, z0.s, sxtw #2]
+; CHECK-NEXT:    st1w { z1.s }, p1, [x1, x10, lsl #2]
+; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    cmp x10, #10
+; CHECK-NEXT:    b.eq .LBB13_2
+; CHECK-NEXT:  // %bb.3: // %for.cond1.cleanup
+; CHECK-NEXT:    // in Loop: Header=BB13_1 Depth=1
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #10
+; CHECK-NEXT:    b.ne .LBB13_1
+; CHECK-NEXT:  // %bb.4: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.ph
+
+vector.ph:
+  %index2 = phi i64 [ 0, %entry ], [ %index2.next, %for.cond1.cleanup ]
+  %splatinsertIdx2 = insertelement <vscale x 4 x i64> poison, i64 %index2, i64 0
+  %splatIdx2 = shufflevector <vscale x 4 x i64> %splatinsertIdx2, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %idx = phi i64 [ 0, %vector.ph ], [ %idx.next, %vector.body ]
+  %splatinsertIdx = insertelement <vscale x 4 x i64> poison, i64 %idx, i64 0
+  %splatIdx = shufflevector <vscale x 4 x i64> %splatinsertIdx, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %splatinsert4 = insertelement <vscale x 4 x i64> poison, i64 4, i64 0
+  %splat4 = shufflevector <vscale x 4 x i64> %splatinsert4, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %t0.0 = add <vscale x 4 x i64> %splatIdx, %step
+  %t0.1 = shl nsw <vscale x 4 x i64> %t0.0, %splat4
+  %t0.2 = add nuw nsw <vscale x 4 x i64> %t0.1, %splatIdx2
+  %t0.3 = getelementptr inbounds float, float* %src, <vscale x 4 x i64> %t0.2
+  %wide.masked.gather = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %t0.3, i32 4, <vscale x 4 x i1> %pg, <vscale x 4 x float> undef)
+  %t8 = getelementptr inbounds float, float* %dst, i64 %idx
+  %t9 = bitcast float* %t8 to <vscale x 4 x float>*
+  store <vscale x 4 x float> %wide.masked.gather, <vscale x 4 x float>* %t9, align 4
+  %t14 = call i64 @llvm.vscale.i64()
+  %t15 = shl nuw nsw i64 %t14, 3
+  %idx.next = add nuw i64 %idx, %t15
+  %t16 = icmp eq i64 %idx.next, 10
+  br i1 %t16, label %vector.body, label %for.cond1.cleanup, !llvm.loop !0
+
+for.cond1.cleanup:
+  %index2.next = add nuw nsw i64 %index2, 1
+  %exitcond = icmp eq i64 %index2.next, 10
+  br i1 %exitcond, label %for.cond.cleanup, label %vector.ph, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond1.cleanup
+  ret void
+}
+
+define void @scatter_nxv4i8(i8* %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data, i64 %n) #0 {
+; CHECK-LABEL: scatter_nxv4i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    index z1.s, #0, w9
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:  .LBB14_1: // %vector.ph
+; CHECK-NEXT:    // =>This Loop Header: Depth=1
+; CHECK-NEXT:    // Child Loop BB14_2 Depth 2
+; CHECK-NEXT:    mov x10, xzr
+; CHECK-NEXT:  .LBB14_2: // %vector.body
+; CHECK-NEXT:    // Parent Loop BB14_1 Depth=1
+; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    add x11, x0, x8
+; CHECK-NEXT:    add x11, x11, x10, lsl #4
+; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    cmp x10, #10
+; CHECK-NEXT:    st1b { z0.s }, p0, [x11, z1.s, sxtw]
+; CHECK-NEXT:    b.eq .LBB14_2
+; CHECK-NEXT:  // %bb.3: // %for.cond1.cleanup
+; CHECK-NEXT:    // in Loop: Header=BB14_1 Depth=1
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #10
+; CHECK-NEXT:    b.ne .LBB14_1
+; CHECK-NEXT:  // %bb.4: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.ph
+
+vector.ph:
+  %index2 = phi i64 [ 0, %entry ], [ %index2.next, %for.cond1.cleanup ]
+  %splatinsertIdx2 = insertelement <vscale x 4 x i64> poison, i64 %index2, i64 0
+  %splatIdx2 = shufflevector <vscale x 4 x i64> %splatinsertIdx2, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %.splatinsert = insertelement <vscale x 4 x i64> poison, i64 %index, i64 0
+  %splatIdx = shufflevector <vscale x 4 x i64> %.splatinsert, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %splatinsert4 = insertelement <vscale x 4 x i64> poison, i64 4, i64 0
+  %splat4 = shufflevector <vscale x 4 x i64> %splatinsert4, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %t0.0 = add <vscale x 4 x i64> %splatIdx, %step
+  %t0.1 = shl <vscale x 4 x i64> %t0.0, %splat4 ; with mul instead of shl the fold does not apply
+  %t0.2 = add nuw nsw <vscale x 4 x i64> %t0.1, %splatIdx2
+  %t0.3 = getelementptr i8, i8* %base, <vscale x 4 x i64> %t0.2
+  call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8*> %t0.3, i32 2, <vscale x 4 x i1> %pg)
+  %t14 = call i64 @llvm.vscale.i64()
+  %t15 = shl nuw nsw i64 %t14, 3
+  %index.next = add nuw i64 %index, %t15
+  %t16 = icmp eq i64 %index.next, 10
+  br i1 %t16, label %vector.body, label %for.cond1.cleanup, !llvm.loop !0
+
+for.cond1.cleanup:
+  %index2.next = add nuw nsw i64 %index2, 1
+  %exitcond = icmp eq i64 %index2.next, 10
+  br i1 %exitcond, label %for.cond.cleanup, label %vector.ph, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond1.cleanup
+  ret void
+}
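+
+;; The step vector below has two users, one per address chain; the one-use
+;; restriction in isInstUsedByGEP applies to the add/shl chain itself, not
+;; to the sunk operands, so the fold is still expected for both scatters.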
+define void @mul_scatter_nxv4i8(i8* %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data1, <vscale x 4 x i8> %data2, i64 %n) #0 {
+; CHECK-LABEL: mul_scatter_nxv4i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    mov w10, #256
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    index z2.s, #0, w9
+; CHECK-NEXT:    index z3.s, #0, w10
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:  .LBB15_1: // %vector.ph
+; CHECK-NEXT:    // =>This Loop Header: Depth=1
+; CHECK-NEXT:    // Child Loop BB15_2 Depth 2
+; CHECK-NEXT:    mov x10, xzr
+; CHECK-NEXT:  .LBB15_2: // %vector.body
+; CHECK-NEXT:    // Parent Loop BB15_1 Depth=1
+; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    add x11, x0, x8
+; CHECK-NEXT:    add x12, x11, x10, lsl #4
+; CHECK-NEXT:    add x11, x11, x10, lsl #8
+; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    cmp x10, #10
+; CHECK-NEXT:    st1b { z0.s }, p0, [x12, z2.s, sxtw]
+; CHECK-NEXT:    st1b { z1.s }, p0, [x11, z3.s, sxtw]
+; CHECK-NEXT:    b.eq .LBB15_2
+; CHECK-NEXT:  // %bb.3: // %for.cond1.cleanup
+; CHECK-NEXT:    // in Loop: Header=BB15_1 Depth=1
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #10
+; CHECK-NEXT:    b.ne .LBB15_1
+; CHECK-NEXT:  // %bb.4: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.ph
+
+vector.ph:
+  %index2 = phi i64 [ 0, %entry ], [ %index2.next, %for.cond1.cleanup ]
+  %splatinsertIdx2 = insertelement <vscale x 4 x i64> poison, i64 %index2, i64 0
+  %splatIdx2 = shufflevector <vscale x 4 x i64> %splatinsertIdx2, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %.splatinsert = insertelement <vscale x 4 x i64> poison, i64 %index, i64 0
+  %splatIdx = shufflevector <vscale x 4 x i64> %.splatinsert, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %splatinsert4 = insertelement <vscale x 4 x i64> poison, i64 4, i64 0
+  %splat4 = shufflevector <vscale x 4 x i64> %splatinsert4, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %splatinsert8 = insertelement <vscale x 4 x i64> poison, i64 8, i64 0
+  %splat8 = shufflevector <vscale x 4 x i64> %splatinsert8, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %t0.0 = add <vscale x 4 x i64> %splatIdx, %step
+  %t0.1 = shl <vscale x 4 x i64> %t0.0, %splat4
+  %t0.2 = add nuw nsw <vscale x 4 x i64> %t0.1, %splatIdx2
+  %t0.3 = getelementptr i8, i8* %base, <vscale x 4 x i64> %t0.2
+  call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data1, <vscale x 4 x i8*> %t0.3, i32 2, <vscale x 4 x i1> %pg)
+  ; second use of the step vector
+  %t1.0 = add <vscale x 4 x i64> %splatIdx, %step
+  %t1.1 = shl <vscale x 4 x i64> %t1.0, %splat8
+  %t1.2 = add nuw nsw <vscale x 4 x i64> %t1.1, %splatIdx2
+  %t1.3 = getelementptr i8, i8* %base, <vscale x 4 x i64> %t1.2
+  call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data2, <vscale x 4 x i8*> %t1.3, i32 2, <vscale x 4 x i1> %pg)
+  %t14 = call i64 @llvm.vscale.i64()
+  %t15 = shl nuw nsw i64 %t14, 3
+  %index.next = add nuw i64 %index, %t15
+  %t16 = icmp eq i64 %index.next, 10
+  br i1 %t16, label %vector.body, label %for.cond1.cleanup, !llvm.loop !0
+
+for.cond1.cleanup:
+  %index2.next = add nuw nsw i64 %index2, 1
+  %exitcond = icmp eq i64 %index2.next, 10
+  br i1 %exitcond, label %for.cond.cleanup, label %vector.ph, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond1.cleanup
+  ret void
+}
+
+; The stepvector is hidden behind the GEP by two adds and a shift.
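+; Per lane the GEP index is (idx + step + index2) * 8 over [8 x half], i.e.
+; a byte offset of 128 * (idx + step + index2). The asm is expected to fold
+; 128*idx and 128*index2 into the scalar base and keep only 128*step
+; (index z1.s, #0, w9 with w9 = 128) as the vector offset.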
+define void @loop_scatter_f16_index_add_add_mul([8 x half]* %base, i64 %offset, i64 %offset2, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
+; CHECK-LABEL: loop_scatter_f16_index_add_add_mul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w9, #128
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    index z1.s, #0, w9
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:  .LBB16_1: // %vector.ph
+; CHECK-NEXT:    // =>This Loop Header: Depth=1
+; CHECK-NEXT:    // Child Loop BB16_2 Depth 2
+; CHECK-NEXT:    mov x10, xzr
+; CHECK-NEXT:  .LBB16_2: // %vector.body
+; CHECK-NEXT:    // Parent Loop BB16_1 Depth=1
+; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    add x11, x0, x8, lsl #7
+; CHECK-NEXT:    add x11, x11, x10, lsl #7
+; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    cmp x10, #10
+; CHECK-NEXT:    st1h { z0.s }, p0, [x11, z1.s, sxtw]
+; CHECK-NEXT:    b.eq .LBB16_2
+; CHECK-NEXT:  // %bb.3: // %for.cond1.cleanup
+; CHECK-NEXT:    // in Loop: Header=BB16_1 Depth=1
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #10
+; CHECK-NEXT:    b.ne .LBB16_1
+; CHECK-NEXT:  // %bb.4: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.ph
+
+vector.ph:
+  %index2 = phi i64 [ 0, %entry ], [ %index2.next, %for.cond1.cleanup ]
+  %splatinsertIdx2 = insertelement <vscale x 4 x i64> poison, i64 %index2, i64 0
+  %splatIdx2 = shufflevector <vscale x 4 x i64> %splatinsertIdx2, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %.splatinsert = insertelement <vscale x 4 x i64> poison, i64 %index, i64 0
+  %splatIdx = shufflevector <vscale x 4 x i64> %.splatinsert, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %add1 = add <vscale x 4 x i64> %splatIdx, %step
+  %add2 = add <vscale x 4 x i64> %add1, %splatIdx2
+  %splat.const8.ins = insertelement <vscale x 4 x i64> undef, i64 8, i32 0
+  %splat.const8 = shufflevector <vscale x 4 x i64> %splat.const8.ins, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
+  %mul = mul <vscale x 4 x i64> %add2, %splat.const8
+  %gep = getelementptr [8 x half], [8 x half]* %base, <vscale x 4 x i64> %mul
+  %gep.bc = bitcast <vscale x 4 x [8 x half]*> %gep to <vscale x 4 x half*>
+  call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %gep.bc, i32 2, <vscale x 4 x i1> %pg)
+  %t14 = call i64 @llvm.vscale.i64()
+  %t15 = shl nuw nsw i64 %t14, 3
+  %index.next = add nuw i64 %index, %t15
+  %t16 = icmp eq i64 %index.next, 10
+  br i1 %t16, label %vector.body, label %for.cond1.cleanup, !llvm.loop !0
+
+for.cond1.cleanup:
+  %index2.next = add nuw nsw i64 %index2, 1
+  %exitcond = icmp eq i64 %index2.next, 10
+  br i1 %exitcond, label %for.cond.cleanup, label %vector.ph, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond1.cleanup
+  ret void
+}
+
+;; Negative tests
+
+; The stepvector is hidden behind the GEP by three adds and a shift,
+; exceeding the recursion threshold, so the fold does not apply.
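+; Single-use chain from the stepvector's add to the GEP:
+;   positive test above: add1 -> add2 -> mul -> gep
+;   this test:           add1 -> add2 -> add3 -> mul -> gep
+; The extra add pushes the GEP past the recursion threshold, so the splats
+; and the stepvector are not sunk and the address is computed per lane.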
+define void @neg_loop_scatter_f16_index_3add_mul([8 x half]* %base, i64 %offset, i64 %offset2, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
+; CHECK-LABEL: neg_loop_scatter_f16_index_3add_mul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    uunpklo z3.d, z0.s
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    incd z2.d
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:  .LBB17_1: // %vector.ph
+; CHECK-NEXT:    // =>This Loop Header: Depth=1
+; CHECK-NEXT:    // Child Loop BB17_2 Depth 2
+; CHECK-NEXT:    mov x10, xzr
+; CHECK-NEXT:  .LBB17_2: // %vector.body
+; CHECK-NEXT:    // Parent Loop BB17_1 Depth=1
+; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    mov z4.d, x10
+; CHECK-NEXT:    mov z5.d, x8
+; CHECK-NEXT:    add z6.d, z4.d, z2.d
+; CHECK-NEXT:    add z4.d, z4.d, z1.d
+; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    add z6.d, z6.d, z5.d
+; CHECK-NEXT:    add z4.d, z4.d, z5.d
+; CHECK-NEXT:    lsl z5.d, z6.d, #7
+; CHECK-NEXT:    lsl z4.d, z4.d, #7
+; CHECK-NEXT:    add z5.d, z5.d, #512 // =0x200
+; CHECK-NEXT:    add z4.d, z4.d, #512 // =0x200
+; CHECK-NEXT:    cmp x10, #10
+; CHECK-NEXT:    st1h { z3.d }, p1, [x0, z4.d]
+; CHECK-NEXT:    st1h { z0.d }, p0, [x0, z5.d]
+; CHECK-NEXT:    b.eq .LBB17_2
+; CHECK-NEXT:  // %bb.3: // %for.cond1.cleanup
+; CHECK-NEXT:    // in Loop: Header=BB17_1 Depth=1
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #10
+; CHECK-NEXT:    b.ne .LBB17_1
+; CHECK-NEXT:  // %bb.4: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.ph
+
+vector.ph:
+  %index2 = phi i64 [ 0, %entry ], [ %index2.next, %for.cond1.cleanup ]
+  %splatinsertIdx2 = insertelement <vscale x 4 x i64> poison, i64 %index2, i64 0
+  %splatIdx2 = shufflevector <vscale x 4 x i64> %splatinsertIdx2, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %.splatinsert = insertelement <vscale x 4 x i64> poison, i64 %index, i64 0
+  %splatIdx = shufflevector <vscale x 4 x i64> %.splatinsert, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %add1 = add <vscale x 4 x i64> %splatIdx, %step
+  %add2 = add <vscale x 4 x i64> %add1, %splatIdx2
+  %splat.const4.ins = insertelement <vscale x 4 x i64> undef, i64 4, i32 0
+  %splat.const4 = shufflevector <vscale x 4 x i64> %splat.const4.ins, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
+  %add3 = add <vscale x 4 x i64> %add2, %splat.const4
+  %splat.const8.ins = insertelement <vscale x 4 x i64> undef, i64 8, i32 0
+  %splat.const8 = shufflevector <vscale x 4 x i64> %splat.const8.ins, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
+  %mul = mul <vscale x 4 x i64> %add3, %splat.const8
+  %gep = getelementptr [8 x half], [8 x half]* %base, <vscale x 4 x i64> %mul
+  %gep.bc = bitcast <vscale x 4 x [8 x half]*> %gep to <vscale x 4 x half*>
+  call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %gep.bc, i32 2, <vscale x 4 x i1> %pg)
+  %t14 = call i64 @llvm.vscale.i64()
+  %t15 = shl nuw nsw i64 %t14, 3
+  %index.next = add nuw i64 %index, %t15
+  %t16 = icmp eq i64 %index.next, 10
+  br i1 %t16, label %vector.body, label %for.cond1.cleanup, !llvm.loop !0
+
+for.cond1.cleanup:
+  %index2.next = add nuw nsw i64 %index2, 1
+  %exitcond = icmp eq i64 %index2.next, 10
+  br i1 %exitcond, label %for.cond.cleanup, label %vector.ph, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond1.cleanup
+  ret void
+}
+
+;; The fold does not apply to: splat + (step + splat),
+;; only to: (step + splat) + splat, because
+;; canonicalizeCommutativeBinop cannot guarantee the
+;; position of the splat.
+define void @neg_scatter_nxv4i8(i8* %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data, i64 %n) #0 {
+; CHECK-LABEL: neg_scatter_nxv4i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    uunpklo z3.d, z0.s
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    incd z2.d
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:  .LBB18_1: // %vector.ph
+; CHECK-NEXT:    // =>This Loop Header: Depth=1
+; CHECK-NEXT:    // Child Loop BB18_2 Depth 2
+; CHECK-NEXT:    mov x10, xzr
+; CHECK-NEXT:  .LBB18_2: // %vector.body
+; CHECK-NEXT:    // Parent Loop BB18_1 Depth=1
+; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    mov z4.d, x10
+; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    add z6.d, z2.d, z4.d
+; CHECK-NEXT:    add z4.d, z1.d, z4.d
+; CHECK-NEXT:    mov z5.d, x8
+; CHECK-NEXT:    lsl z6.d, z6.d, #4
+; CHECK-NEXT:    lsl z4.d, z4.d, #4
+; CHECK-NEXT:    add z6.d, z5.d, z6.d
+; CHECK-NEXT:    add z4.d, z5.d, z4.d
+; CHECK-NEXT:    cmp x10, #10
+; CHECK-NEXT:    st1b { z3.d }, p1, [x0, z4.d]
+; CHECK-NEXT:    st1b { z0.d }, p0, [x0, z6.d]
+; CHECK-NEXT:    b.eq .LBB18_2
+; CHECK-NEXT:  // %bb.3: // %for.cond1.cleanup
+; CHECK-NEXT:    // in Loop: Header=BB18_1 Depth=1
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #10
+; CHECK-NEXT:    b.ne .LBB18_1
+; CHECK-NEXT:  // %bb.4: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.ph
+
+vector.ph:
+  %index2 = phi i64 [ 0, %entry ], [ %index2.next, %for.cond1.cleanup ]
+  %splatinsertIdx2 = insertelement <vscale x 4 x i64> poison, i64 %index2, i64 0
+  %splatIdx2 = shufflevector <vscale x 4 x i64> %splatinsertIdx2, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %.splatinsert = insertelement <vscale x 4 x i64> poison, i64 %index, i64 0
+  %splatIdx = shufflevector <vscale x 4 x i64> %.splatinsert, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %splatinsert4 = insertelement <vscale x 4 x i64> poison, i64 4, i64 0
+  %splat4 = shufflevector <vscale x 4 x i64> %splatinsert4, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %t0.0 = add <vscale x 4 x i64> %splatIdx, %step
+  %t0.1 = shl <vscale x 4 x i64> %t0.0, %splat4
+  %t0.2 = add nuw nsw <vscale x 4 x i64> %splatIdx2, %t0.1
+  %t0.3 = getelementptr i8, i8* %base, <vscale x 4 x i64> %t0.2
+  call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8*> %t0.3, i32 2, <vscale x 4 x i1> %pg)
+  %t14 = call i64 @llvm.vscale.i64()
+  %t15 = shl nuw nsw i64 %t14, 3
+  %index.next = add nuw i64 %index, %t15
+  %t16 = icmp eq i64 %index.next, 10
+  br i1 %t16, label %vector.body, label %for.cond1.cleanup, !llvm.loop !0
+
+for.cond1.cleanup:
+  %index2.next = add nuw nsw i64 %index2, 1
+  %exitcond = icmp eq i64 %index2.next, 10
+  br i1 %exitcond, label %for.cond.cleanup, label %vector.ph, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond1.cleanup
+  ret void
+}
+
+; The index is removed from the "vscale x 4" wide scatter
+; when the offset is too big.
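+; Note: the splatted shift amount (33554432) is larger than the i64 bit
+; width, so the shifted index is poison and no index computation survives
+; in the generated code.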
+define void @neg_scatter_nxv4i8_noIndex(i8* %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data, i64 %n) #0 {
+; CHECK-LABEL: neg_scatter_nxv4i8_noIndex:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:  .LBB19_1: // %vector.ph
+; CHECK-NEXT:    // =>This Loop Header: Depth=1
+; CHECK-NEXT:    // Child Loop BB19_2 Depth 2
+; CHECK-NEXT:    mov x10, xzr
+; CHECK-NEXT:  .LBB19_2: // %vector.body
+; CHECK-NEXT:    // Parent Loop BB19_1 Depth=1
+; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    st1b { z1.d }, p1, [x0, z0.d]
+; CHECK-NEXT:    cmp x10, #10
+; CHECK-NEXT:    st1b { z0.d }, p0, [x0, z0.d]
+; CHECK-NEXT:    b.eq .LBB19_2
+; CHECK-NEXT:  // %bb.3: // %for.cond1.cleanup
+; CHECK-NEXT:    // in Loop: Header=BB19_1 Depth=1
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #10
+; CHECK-NEXT:    b.ne .LBB19_1
+; CHECK-NEXT:  // %bb.4: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.ph
+
+vector.ph:
+  %index2 = phi i64 [ 0, %entry ], [ %index2.next, %for.cond1.cleanup ]
+  %splatinsertIdx2 = insertelement <vscale x 4 x i64> poison, i64 %index2, i64 0
+  %splatIdx2 = shufflevector <vscale x 4 x i64> %splatinsertIdx2, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %step = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %.splatinsert = insertelement <vscale x 4 x i64> poison, i64 %index, i64 0
+  %splatIdx = shufflevector <vscale x 4 x i64> %.splatinsert, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %splatinsert4 = insertelement <vscale x 4 x i64> poison, i64 33554432, i64 0
+  %splat4 = shufflevector <vscale x 4 x i64> %splatinsert4, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %t0.0 = add <vscale x 4 x i64> %splatIdx, %step
+  %t0.1 = shl <vscale x 4 x i64> %t0.0, %splat4
+  %t0.2 = add nuw nsw <vscale x 4 x i64> %t0.1, %splatIdx2
+  %t0.3 = getelementptr i8, i8* %base, <vscale x 4 x i64> %t0.2
+  call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8*> %t0.3, i32 2, <vscale x 4 x i1> %pg)
+  %t14 = call i64 @llvm.vscale.i64()
+  %t15 = shl nuw nsw i64 %t14, 3
+  %index.next = add nuw i64 %index, %t15
+  %t16 = icmp eq i64 %index.next, 10
+  br i1 %t16, label %vector.body, label %for.cond1.cleanup, !llvm.loop !0
+
+for.cond1.cleanup:
+  %index2.next = add nuw nsw i64 %index2, 1
+  %exitcond = icmp eq i64 %index2.next, 10
+  br i1 %exitcond, label %for.cond.cleanup, label %vector.ph, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond1.cleanup
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve" vscale_range(1, 16) }
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.isvectorized", i32 1}
+!2 = distinct !{!2, !3, !1}
+!3 = !{!"llvm.loop.unroll.runtime.disable"}
 declare i64 @llvm.vscale.i64()
 declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
 declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
@@ -340,3 +859,4 @@
 declare void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)
 declare void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half*>, i32, <vscale x 4 x i1>)
 declare <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+