Index: llvm/test/CodeGen/AArch64/sve-int-arith.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-int-arith.ll
+++ llvm/test/CodeGen/AArch64/sve-int-arith.ll
@@ -744,6 +744,83 @@
   ret %2
 }
 
+define <vscale x 8 x i16> @multiple_fused_ops(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+; CHECK-LABEL: multiple_fused_ops:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    add z2.h, z2.h, #200 // =0xc8
+; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    sub z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+{
+  %1 = mul <vscale x 8 x i16> %a, %b
+  %2 = add <vscale x 8 x i16> %1, shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 200, i16 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+  %3 = mul <vscale x 8 x i16> %2, %a
+  %4 = sub <vscale x 8 x i16> %3, %b
+  ret <vscale x 8 x i16> %4
+}
+
+define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
+; CHECK-LABEL: mad_in_loop:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w3, #1
+; CHECK-NEXT:    b.lt .LBB70_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, w3
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    cntw x10
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    whilelo p1.s, xzr, x9
+; CHECK-NEXT:  .LBB70_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x2, x8, lsl #2]
+; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    add z0.s, z0.s, #1 // =0x1
+; CHECK-NEXT:    st1w { z0.s }, p1, [x0, x8, lsl #2]
+; CHECK-NEXT:    add x8, x8, x10
+; CHECK-NEXT:    whilelo p1.s, x8, x9
+; CHECK-NEXT:    b.mi .LBB70_2
+; CHECK-NEXT:  .LBB70_3: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %for.body.preheader
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %2 = getelementptr inbounds i32, ptr %src1, i64 %index
+  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %2, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+  %3 = getelementptr inbounds i32, ptr %src2, i64 %index
+  %wide.masked.load12 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
+  %4 = mul nsw <vscale x 4 x i32> %wide.masked.load12, %wide.masked.load
+  %5 = add nsw <vscale x 4 x i32> %4, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %6 = getelementptr inbounds i32, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
+  %index.next = add i64 %index, %1
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %7, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>)
+
 declare <vscale x 16 x i8> @llvm.sadd.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
 declare <vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
 declare <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)