diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12347,6 +12347,9 @@
                 "Unsupported opcode.");
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
+  if (VT == MVT::nxv8bf16 &&
+      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+    return SDValue();

   EVT LoadVT = VT;
   if (VT.isFloatingPoint())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -495,6 +495,9 @@
 def SImmS16XForm : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64);
 }]>;
+def SImmS32XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 32, SDLoc(N), MVT::i64);
+}]>;

 // simm6sN predicate - True if the immediate is a multiple of N in the range
 // [-32 * N, 31 * N].
@@ -546,7 +549,7 @@
   let DecoderMethod = "DecodeSImm<4>";
 }
 def simm4s32 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> {
+[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }], SImmS32XForm> {
   let PrintMethod = "printImmScale<32>";
   let ParserMatchClass = SImm4s32Operand;
   let DecoderMethod = "DecodeSImm<4>";
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7728,9 +7728,13 @@
                   (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>;

   // Base addressing mode
-  def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)),
-            (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>;
-
+  def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), GPR64sp:$base)),
+            (!cast<Instruction>(NAME) PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>;
+  let AddedComplexity = 2 in {
+    // Reg + Imm addressing mode
+    def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), (add GPR64:$base, (i64 simm4s32:$imm)))),
+              (!cast<Instruction>(NAME) $Pg, $base, simm4s32:$imm)>;
+  }
 }

 class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList,
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll
@@ -0,0 +1,174 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+f64mm -asm-verbose=0 < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; WARN-NOT: warning
+
+;
+; LD1ROB
+;
+
+define <vscale x 16 x i8> @ld1rob_i8(<vscale x 16 x i1> %pg, i8* %a) nounwind {
+; CHECK-LABEL: ld1rob_i8:
+; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, #32]
+; CHECK-NEXT: ret
+  %base = getelementptr i8, i8* %a, i64 32
+  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
+  ret <vscale x 16 x i8> %load
+}
+
+;
+; LD1ROH
+;
+
+define <vscale x 8 x i16> @ld1roh_i16(<vscale x 8 x i1> %pg, i16* %a) nounwind {
+; CHECK-LABEL: ld1roh_i16:
+; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64]
+; CHECK-NEXT: ret
+  %base = getelementptr i16, i16* %a, i64 32
+  %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1ro.nxv8i16(<vscale x 8 x i1> %pg, i16* %base)
+  ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 8 x half> @ld1roh_f16(<vscale x 8 x i1> %pg, half* %a) nounwind {
+; CHECK-LABEL: ld1roh_f16:
+; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64]
+; CHECK-NEXT: ret
+  %base = getelementptr half, half* %a, i64 32
+  %load = call <vscale x 8 x half> @llvm.aarch64.sve.ld1ro.nxv8f16(<vscale x 8 x i1> %pg, half* %base)
+  ret <vscale x 8 x half> %load
+}
+
+define <vscale x 8 x bfloat> @ld1roh_bf16(<vscale x 8 x i1> %pg, bfloat* %a) nounwind #0 {
+; CHECK-LABEL: ld1roh_bf16:
+; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64]
+; CHECK-NEXT: ret
+  %base = getelementptr bfloat, bfloat* %a, i64 32
+  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1ro.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base)
+  ret <vscale x 8 x bfloat> %load
+}
+
+;
+; LD1ROW
+;
+
+define <vscale x 4 x i32> @ld1row_i32(<vscale x 4 x i1> %pg, i32* %a) nounwind {
+; CHECK-LABEL: ld1row_i32:
+; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0, #128]
+; CHECK-NEXT: ret
+  %base = getelementptr i32, i32* %a, i64 32
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1ro.nxv4i32(<vscale x 4 x i1> %pg, i32* %base)
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x float> @ld1row_f32(<vscale x 4 x i1> %pg, float* %a) nounwind {
+; CHECK-LABEL: ld1row_f32:
+; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0, #128]
+; CHECK-NEXT: ret
+  %base = getelementptr float, float* %a, i64 32
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1ro.nxv4f32(<vscale x 4 x i1> %pg, float* %base)
+  ret <vscale x 4 x float> %load
+}
+
+;
+; LD1ROD
+;
+
+define <vscale x 2 x i64> @ld1rod_i64(<vscale x 2 x i1> %pg, i64* %a) nounwind {
+; CHECK-LABEL: ld1rod_i64:
+; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #-64]
+; CHECK-NEXT: ret
+  %base = getelementptr i64, i64* %a, i64 -8
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1ro.nxv2i64(<vscale x 2 x i1> %pg, i64* %base)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @ld1rod_f64(<vscale x 2 x i1> %pg, double* %a) nounwind {
+; CHECK-LABEL: ld1rod_f64:
+; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #-128]
+; CHECK-NEXT: ret
+  %base = getelementptr double, double* %a, i64 -16
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> %pg, double* %base)
+  ret <vscale x 2 x double> %load
+}
+
+
+;;;;;;;;;;;;;;
+; range checks: immediate must be a multiple of 32 in the range -256, ..., 224
+
+; lower bound
+define <vscale x 16 x i8> @ld1rob_i8_lower_bound(<vscale x 16 x i1> %pg, i8* %a) nounwind {
+; CHECK-LABEL: ld1rob_i8_lower_bound:
+; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, #-256]
+; CHECK-NEXT: ret
+  %base = getelementptr i8, i8* %a, i64 -256
+  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
+  ret <vscale x 16 x i8> %load
+}
+
+; below lower bound
+define <vscale x 8 x i16> @ld1roh_i16_below_lower_bound(<vscale x 8 x i1> %pg, i16* %a) nounwind {
+; CHECK-LABEL: ld1roh_i16_below_lower_bound:
+; CHECK-NEXT: sub x[[BASE:[0-9]+]], x0, #258
+; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x[[BASE]]]
+; CHECK-NEXT: ret
+  %base = getelementptr i16, i16* %a, i64 -129
+  %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1ro.nxv8i16(<vscale x 8 x i1> %pg, i16* %base)
+  ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 16 x i8> @ld1rob_i8_below_lower_bound_01(<vscale x 16 x i1> %pg, i8* %a) nounwind {
+; CHECK-LABEL: ld1rob_i8_below_lower_bound_01:
+; CHECK-NEXT: mov x[[OFFSET:[0-9]+]], #-257
+; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, x[[OFFSET]]]
+; CHECK-NEXT: ret
+  %base = getelementptr i8, i8* %a, i64 -257
+  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
+  ret <vscale x 16 x i8> %load
+}
+
+; not a multiple of 32
+define <vscale x 4 x i32> @ld1row_i32_not_multiple(<vscale x 4 x i1> %pg, i32* %a) nounwind {
+; CHECK-LABEL: ld1row_i32_not_multiple:
+; CHECK-NEXT: add x[[BASE:[0-9]+]], x0, #12
+; CHECK-NEXT: ld1row { z0.s }, p0/z, [x[[BASE]]]
+; CHECK-NEXT: ret
+  %base = getelementptr i32, i32* %a, i64 3
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1ro.nxv4i32(<vscale x 4 x i1> %pg, i32* %base)
+  ret <vscale x 4 x i32> %load
+}
+
+; upper bound
+define <vscale x 2 x i64> @ld1rod_i64_upper_bound(<vscale x 2 x i1> %pg, i64* %a) nounwind {
+; CHECK-LABEL: ld1rod_i64_upper_bound:
+; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #224]
+; CHECK-NEXT: ret
+  %base = getelementptr i64, i64* %a, i64 28
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1ro.nxv2i64(<vscale x 2 x i1> %pg, i64* %base)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 16 x i8> @ld1rob_i8_beyond_upper_bound(<vscale x 16 x i1> %pg, i8* %a) nounwind {
+; CHECK-LABEL: ld1rob_i8_beyond_upper_bound:
+; CHECK-NEXT: mov w[[OFFSET:[0-9]+]], #225
+; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, x[[OFFSET]]]
+; CHECK-NEXT: ret
+  %base = getelementptr i8, i8* %a, i64 225
+  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
+  ret <vscale x 16 x i8> %load
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1>, i8*)
+
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1ro.nxv8i16(<vscale x 8 x i1>, i16*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ld1ro.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1ro.nxv8bf16(<vscale x 8 x i1>, bfloat*)
+
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1ro.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1ro.nxv4f32(<vscale x 4 x i1>, float*)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1ro.nxv2i64(<vscale x 2 x i1>, i64*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1>, double*)
+
+
+; +bf16 is required for the bfloat version.
+attributes #0 = { "target-features"="+sve,+f64mm,+bf16" }