diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12645,9 +12645,6 @@
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
 bool AArch64TargetLowering::shouldSinkOperands(
     Instruction *I, SmallVectorImpl<Use *> &Ops) const {
-  if (!I->getType()->isVectorTy())
-    return false;
-
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
     switch (II->getIntrinsicID()) {
     case Intrinsic::aarch64_neon_smull:
@@ -12660,7 +12657,8 @@
       LLVM_FALLTHROUGH;
 
     case Intrinsic::fma:
-      if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
+      if (isa<VectorType>(I->getType()) &&
+          cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
           !Subtarget->hasFullFP16())
         return false;
       LLVM_FALLTHROUGH;
@@ -12673,7 +12671,46 @@
       if (isSplatShuffle(II->getOperand(1)))
         Ops.push_back(&II->getOperandUse(1));
       return !Ops.empty();
-
+    case Intrinsic::aarch64_sme_write_horiz:
+    case Intrinsic::aarch64_sme_write_vert:
+    case Intrinsic::aarch64_sme_writeq_horiz:
+    case Intrinsic::aarch64_sme_writeq_vert: {
+      auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
+      if (!Idx || Idx->getOpcode() != Instruction::Add)
+        return false;
+      Ops.push_back(&II->getOperandUse(1));
+      return true;
+    }
+    case Intrinsic::aarch64_sme_read_horiz:
+    case Intrinsic::aarch64_sme_read_vert:
+    case Intrinsic::aarch64_sme_readq_horiz:
+    case Intrinsic::aarch64_sme_readq_vert:
+    case Intrinsic::aarch64_sme_ld1b_vert:
+    case Intrinsic::aarch64_sme_ld1h_vert:
+    case Intrinsic::aarch64_sme_ld1w_vert:
+    case Intrinsic::aarch64_sme_ld1d_vert:
+    case Intrinsic::aarch64_sme_ld1q_vert:
+    case Intrinsic::aarch64_sme_st1b_vert:
+    case Intrinsic::aarch64_sme_st1h_vert:
+    case Intrinsic::aarch64_sme_st1w_vert:
+    case Intrinsic::aarch64_sme_st1d_vert:
+    case Intrinsic::aarch64_sme_st1q_vert:
+    case Intrinsic::aarch64_sme_ld1b_horiz:
+    case Intrinsic::aarch64_sme_ld1h_horiz:
+    case Intrinsic::aarch64_sme_ld1w_horiz:
+    case Intrinsic::aarch64_sme_ld1d_horiz:
+    case Intrinsic::aarch64_sme_ld1q_horiz:
+    case Intrinsic::aarch64_sme_st1b_horiz:
+    case Intrinsic::aarch64_sme_st1h_horiz:
+    case Intrinsic::aarch64_sme_st1w_horiz:
+    case Intrinsic::aarch64_sme_st1d_horiz:
+    case Intrinsic::aarch64_sme_st1q_horiz: {
+      auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
+      if (!Idx || Idx->getOpcode() != Instruction::Add)
+        return false;
+      Ops.push_back(&II->getOperandUse(3));
+      return true;
+    }
     case Intrinsic::aarch64_neon_pmull:
       if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
         return false;
@@ -12692,6 +12729,9 @@
     }
   }
 
+  if (!I->getType()->isVectorTy())
+    return false;
+
   switch (I->getOpcode()) {
   case Instruction::Sub:
   case Instruction::Add: {
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
@@ -295,6 +295,40 @@
   ret void;
 }
 
+; Ensure that the tile offset is sunk, given that this is likely to be an 'add'
+; that's decomposed into a base + offset in ISel.
+define void @test_ld1_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) {
+; CHECK-LABEL: test_ld1_sink_tile0_offset_operand:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:  .LBB14_1: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w {za0h.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    subs w2, w2, #1
+; CHECK-NEXT:    ld1w {za0h.s[w12, 1]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za0h.s[w12, 2]}, p0/z, [x0]
+; CHECK-NEXT:    b.ne .LBB14_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  %add1 = add i32 %base, 1
+  %add2 = add i32 %base, 2
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i64 0, i32 %base)
+  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i64 0, i32 %add1)
+  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i64 0, i32 %add2)
+  %inc = add nuw nsw i32 %i, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
 declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, ptr, i64, i32)
 declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1>, ptr, i64, i32)
 declare void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1>, ptr, i64, i32)
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_b:
@@ -435,6 +435,45 @@
   ret %res
 }
 
+define <vscale x 4 x i32> @test_sink_offset_operand(<vscale x 4 x i1> %pg, i32 %base, i32 %N) {
+; CHECK-LABEL: test_sink_offset_operand:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:  .LBB26_1: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z2.s, p0/m, za0h.s[w12, 1]
+; CHECK-NEXT:    subs w1, w1, #3
+; CHECK-NEXT:    mov z3.s, p0/m, za0h.s[w12, 2]
+; CHECK-NEXT:    b.ne .LBB26_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    add z0.s, z1.s, z2.s
+; CHECK-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-NEXT:    ret
+entry:
+  %add1 = add i32 %base, 1
+  %add2 = add i32 %base, 2
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, i64 0, i32 %base)
+  %z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, i64 0, i32 %add1)
+  %z2 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, i64 0, i32 %add2)
+  %inc = add nuw nsw i32 %i, 3
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  %tmp1 = add <vscale x 4 x i32> %z0, %z1
+  %res = add <vscale x 4 x i32> %tmp1, %z2
+  ret <vscale x 4 x i32> %res
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i64, i32)
 declare <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i64, i32)
 declare <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, i64, i32)
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-insert.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-insert.ll
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-insert.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-insert.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -verify-machineinstrs < %s | FileCheck %s
 
 define void @insert_row_b(i32 %tileslice, <vscale x 16 x i1> %pg,
 ; CHECK-LABEL: insert_row_b:
@@ -438,6 +438,37 @@
   ret void
 }
 
+define void @test_sink_offset_operand(<vscale x 4 x i1> %pg, i32 %base, i32 %N) {
+; CHECK-LABEL: test_sink_offset_operand:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:  .LBB28_1: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    subs w1, w1, #3
+; CHECK-NEXT:    mov za0h.s[w12, 0], p0/m, z0.s
+; CHECK-NEXT:    mov za0h.s[w12, 1], p0/m, z0.s
+; CHECK-NEXT:    mov za0h.s[w12, 2], p0/m, z0.s
+; CHECK-NEXT:    b.ne .LBB28_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  %add1 = add i32 %base, 1
+  %add2 = add i32 %base, 2
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  call void @llvm.aarch64.sme.write.horiz.nxv4i32(i64 0, i32 %base, <vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  call void @llvm.aarch64.sme.write.horiz.nxv4i32(i64 0, i32 %add1, <vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  call void @llvm.aarch64.sme.write.horiz.nxv4i32(i64 0, i32 %add2, <vscale x 4 x i1> %pg, <vscale x 4 x i32> zeroinitializer)
+  %inc = add nuw nsw i32 %i, 3
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
 declare void @llvm.aarch64.sme.write.horiz.nxv16i8(i64, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
 declare void @llvm.aarch64.sme.write.horiz.nxv8i16(i64, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
@@ -295,6 +295,39 @@
   ret void;
 }
 
+; Ensure that the tile offset is sunk, given that this is likely to be an 'add'
+; that's decomposed into a base + offset in ISel.
+define void @test_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) {
+; CHECK-LABEL: test_sink_tile0_offset_operand:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:  .LBB14_1: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st1w {za0h.s[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    subs w2, w2, #1
+; CHECK-NEXT:    st1w {za0h.s[w12, 1]}, p0, [x0]
+; CHECK-NEXT:    st1w {za0h.s[w12, 2]}, p0, [x0]
+; CHECK-NEXT:    b.ne .LBB14_1
+; CHECK-NEXT:  // %bb.2: // %exit
+; CHECK-NEXT:    ret
+entry:
+  %add0 = add i32 %base, 1
+  %add1 = add i32 %base, 2
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i64 0, i32 %base)
+  tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i64 0, i32 %add0)
+  tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i64 0, i32 %add1)
+  %inc = add nuw nsw i32 %i, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
 declare void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1>, ptr, i64, i32)
 declare void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1>, ptr, i64, i32)
 declare void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1>, ptr, i64, i32)