diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -221,6 +221,15 @@
   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
 
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
+  /// SVE Reg+Imm addressing mode.
+  template <int64_t Min, int64_t Max>
+  bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
+                                SDValue &OffImm);
+  /// SVE Reg+Reg address mode.
+  template <unsigned Scale>
+  bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
+    return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
+  }
 
   void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
   void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -280,6 +289,8 @@
 
   bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
   bool SelectSVEArithImm(SDValue N, SDValue &Imm);
+  bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
+                               SDValue &Offset);
 };
 } // end anonymous namespace
 
@@ -4399,3 +4410,76 @@
                                          CodeGenOpt::Level OptLevel) {
   return new AArch64DAGToDAGISel(TM, OptLevel);
 }
+
+/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
+/// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max
+/// where Root is the memory access using N for its address.
+template <int64_t Min, int64_t Max>
+bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
+                                                   SDValue &Base,
+                                                   SDValue &OffImm) {
+  assert(isa<MemSDNode>(Root) && "Invalid node.");
+
+  EVT MemVT = cast<MemSDNode>(Root)->getMemoryVT();
+
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+
+  SDValue VScale = N.getOperand(1);
+  if (VScale.getOpcode() != ISD::VSCALE)
+    return false;
+
+  TypeSize TS = MemVT.getSizeInBits();
+  int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinSize()) / 8;
+  int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
+
+  if ((MulImm % MemWidthBytes) != 0)
+    return false;
+
+  int64_t Offset = MulImm / MemWidthBytes;
+  if (Offset < Min || Offset > Max)
+    return false;
+
+  Base = N.getOperand(0);
+  OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
+  return true;
+}
+
+/// Select register plus register addressing mode for SVE, with scaled
+/// offset.
+bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
+                                                  SDValue &Base,
+                                                  SDValue &Offset) {
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+
+  // Process an ADD node.
+  const SDValue LHS = N.getOperand(0);
+  const SDValue RHS = N.getOperand(1);
+
+  // 8 bit data does not come with the SHL node, so it is treated
+  // separately.
+  if (Scale == 0) {
+    Base = LHS;
+    Offset = RHS;
+    return true;
+  }
+
+  // Check if the RHS is a shift node with a constant.
+  if (RHS.getOpcode() != ISD::SHL)
+    return false;
+
+  const SDValue ShiftRHS = RHS.getOperand(1);
+  auto *C = dyn_cast<ConstantSDNode>(ShiftRHS);
+  if (nullptr == C)
+    return false;
+
+  const uint64_t ShiftAmount = C->getZExtValue();
+  if (ShiftAmount == Scale) {
+    Base = LHS;
+    Offset = RHS.getOperand(0);
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1209,81 +1209,100 @@
 
   // Add more complex addressing modes here as required
   multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
-                       Instruction RegImmInst> {
-
+                       Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
+    // reg + reg
+    let AddedComplexity = 1 in {
+      def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
+                           (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>;
+    }
+    // reg + imm
+    let AddedComplexity = 2 in {
+      def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
+                           (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>;
+    }
     def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))),
                          (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>;
   }
 
   // 2-element contiguous loads
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
 
   // 4-element contiguous loads
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
 
   // 8-element contiguous loads
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
 
   // 16-element contiguous loads
-  defm : pred_load;
+  defm : pred_load;
 
   multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
-                        Instruction RegImmInst> {
+                        Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
+    // reg + reg
+    let AddedComplexity = 1 in {
+      def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)),
+                         (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>;
+    }
+    // reg + imm
+    let AddedComplexity = 2 in {
+      def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
+                         (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
+    }
     def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
                        (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
   }
 
   // 2-element contiguous stores
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
 
   // 4-element contiguous stores
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
 
   // 8-element contiguous stores
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
 
   // 16-element contiguous stores
-  defm : pred_store;
+  defm : pred_store;
 
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
-  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
+  defm : pred_load;
 
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
-  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
+  defm : pred_store;
 
   multiclass unpred_store {
     def _fi : Pat<(store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7032,3 +7032,11 @@
 
   let Constraints = "$Zdn = $_Zdn";
 }
+
+/// Addressing modes
+def am_sve_indexed_s4 : ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8, 7>", [], [SDNPWantRoot]>;
+
+def am_sve_regreg_lsl0 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<0>", []>;
+def am_sve_regreg_lsl1 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<1>", []>;
+def am_sve_regreg_lsl2 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<2>", []>;
+def am_sve_regreg_lsl3 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<3>", []>;
diff --git a/llvm/test/CodeGen/AArch64/sve-gep.ll b/llvm/test/CodeGen/AArch64/sve-gep.ll
--- a/llvm/test/CodeGen/AArch64/sve-gep.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gep.ll
@@ -4,8 +4,8 @@
 define * @scalar_of_scalable_1(* %base) {
 ; CHECK-LABEL: scalar_of_scalable_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rdvl x8, #1
-; CHECK-NEXT:    add x0, x0, x8, lsl #2
+; CHECK-NEXT:    rdvl x8, #4
+; CHECK-NEXT:    add x0, x0, x8
 ; CHECK-NEXT:    ret
   %d = getelementptr , * %base, i64 4
   ret * %d
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
@@ -0,0 +1,622 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
+
+; Range checks: for all the instructions tested in this file, the
+; immediate must be within the range [-8, 7] (4-bit immediate). Out of
+; range values are tested only in one case (following). Valid values
+; are tested all through the rest of the file.
+ +define void @imm_out_of_range( * %base, %mask) nounwind { +; CHECK-LABEL: imm_out_of_range: +; CHECK-NEXT: rdvl x8, #8 +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}] +; CHECK-NEXT: rdvl x8, #-9 +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x{{[0-9]+}}] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 8 + %data = call @llvm.masked.load.nxv2i64(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 -9 + call void @llvm.masked.store.nxv2i64( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +; 2-lane contiguous load/stores + +define void @test_masked_ldst_sv2i8( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2i8(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2i8( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv2i16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i16: +; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2i16(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2i16( %data, + * %base_store, + i32 1, + %mask) + ret void +} + + +define void @test_masked_ldst_sv2i32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i32: +; CHECK-NEXT: ld1sw { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2i32(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2i32( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv2i64( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2i64(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2i64( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv2f16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2f16(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2f16( %data, + * %base_store, + i32 1, + %mask) + ret void +} + + +define void @test_masked_ldst_sv2f32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + 
%base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2f32(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2f32( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv2f64( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl] +; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, #-5, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -6 + %data = call @llvm.masked.load.nxv2f64(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 -5 + call void @llvm.masked.store.nxv2f64( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +; 2-lane zero/sign extended contiguous loads. + +define @masked_zload_sv2i8_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64: +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -4 + %load = call @llvm.masked.load.nxv2i8(* %base_load, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i8_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64: +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -3 + %load = call @llvm.masked.load.nxv2i8(* %base_load, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +define @masked_zload_sv2i16_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64: +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 1 + %load = call @llvm.masked.load.nxv2i16(* %base_load, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i16_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64: +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 2 + %load = call @llvm.masked.load.nxv2i16(* %base_load, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +define @masked_zload_sv2i32_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64: +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, #-2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -2 + %load = call @llvm.masked.load.nxv2i32(* %base_load, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i32_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64: +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %load = call @llvm.masked.load.nxv2i32(* %base_load, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +; 2-lane truncating contiguous stores. 
+ +define void @masked_trunc_store_sv2i64_to_sv2i8( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8: +; CHECK-NEXT: st1b { z0.d }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 3 + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i8( %trunc, + *%base_load, + i32 8, + %mask) + ret void +} + + +define void @masked_trunc_store_sv2i64_to_sv2i16( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, #4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 4 + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i16( %trunc, + *%base_load, + i32 8, + %mask) + ret void +} + +define void @masked_trunc_store_sv2i64_to_sv2i32( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, #5, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 5 + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i32( %trunc, + *%base_load, + i32 8, + %mask) + ret void +} + +; 4-lane contiguous load/stores. + +define void @test_masked_ldst_sv4i8( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.masked.load.nxv4i8(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.masked.store.nxv4i8( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv4i16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i16: +; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.masked.load.nxv4i16(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.masked.store.nxv4i16( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv4i32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.masked.load.nxv4i32(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.masked.store.nxv4i32( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv4f16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.masked.load.nxv4f16(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.masked.store.nxv4f16( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv4f32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 
-1 + %data = call @llvm.masked.load.nxv4f32(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.masked.store.nxv4f32( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +; 4-lane zero/sign extended contiguous loads. + +define @masked_zload_sv4i8_to_sv4i32(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32: +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -4 + %load = call @llvm.masked.load.nxv4i8(* %base_load, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv4i8_to_sv4i32(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32: +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -3 + %load = call @llvm.masked.load.nxv4i8(* %base_load, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +define @masked_zload_sv4i16_to_sv4i32(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32: +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 1 + %load = call @llvm.masked.load.nxv4i16(* %base_load, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv4i16_to_sv4i32(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32: +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 2 + %load = call @llvm.masked.load.nxv4i16(* %base_load, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +; 4-lane truncating contiguous stores. + +define void @masked_trunc_store_sv4i32_to_sv4i8( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8: +; CHECK-NEXT: st1b { z0.s }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 3 + %trunc = trunc %val to + call void @llvm.masked.store.nxv4i8( %trunc, + *%base_load, + i32 8, + %mask) + ret void +} + + +define void @masked_trunc_store_sv4i32_to_sv4i16( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, #4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 4 + %trunc = trunc %val to + call void @llvm.masked.store.nxv4i16( %trunc, + *%base_load, + i32 8, + %mask) + ret void +} + +; 8-lane contiguous load/stores. 
+ +define void @test_masked_ldst_sv8i8( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.masked.load.nxv8i8(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.masked.store.nxv8i8( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv8i16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.masked.load.nxv8i16(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.masked.store.nxv8i16( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv8f16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.masked.load.nxv8f16(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.masked.store.nxv8f16( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +; 8-lane zero/sign extended contiguous loads. + +define @masked_zload_sv8i8_to_sv8i16(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16: +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -4 + %load = call @llvm.masked.load.nxv8i8(* %base_load, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv8i8_to_sv8i16(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16: +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -3 + %load = call @llvm.masked.load.nxv8i8(* %base_load, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +; 8-lane truncating contiguous stores. + +define void @masked_trunc_store_sv8i16_to_sv8i8( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8: +; CHECK-NEXT: st1b { z0.h }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 3 + %trunc = trunc %val to + call void @llvm.masked.store.nxv8i8( %trunc, + *%base_load, + i32 8, + %mask) + ret void +} + +; 16-lane contiguous load/stores. + +define void @test_masked_ldst_sv16i8( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv16i8: +; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.masked.load.nxv16i8(* %base_load, + i32 1, + %mask, + undef) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.masked.store.nxv16i8( %data, + * %base_store, + i32 1, + %mask) + ret void +} + +; 2-element contiguous loads. 
+declare @llvm.masked.load.nxv2i8 (* , i32, , ) +declare @llvm.masked.load.nxv2i16(*, i32, , ) +declare @llvm.masked.load.nxv2i32(*, i32, , ) +declare @llvm.masked.load.nxv2i64(*, i32, , ) +declare @llvm.masked.load.nxv2f16(*, i32, , ) +declare @llvm.masked.load.nxv2f32(*, i32, , ) +declare @llvm.masked.load.nxv2f64(*, i32, , ) + +; 4-element contiguous loads. +declare @llvm.masked.load.nxv4i8 (* , i32, , ) +declare @llvm.masked.load.nxv4i16(*, i32, , ) +declare @llvm.masked.load.nxv4i32(*, i32, , ) +declare @llvm.masked.load.nxv4f16(*, i32, , ) +declare @llvm.masked.load.nxv4f32(*, i32, , ) + +; 8-element contiguous loads. +declare @llvm.masked.load.nxv8i8 (* , i32, , ) +declare @llvm.masked.load.nxv8i16(*, i32, , ) +declare @llvm.masked.load.nxv8f16(*, i32, , ) + +; 16-element contiguous loads. +declare @llvm.masked.load.nxv16i8(*, i32, , ) + +; 2-element contiguous stores. +declare void @llvm.masked.store.nxv2i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv2i16(, *, i32, ) +declare void @llvm.masked.store.nxv2i32(, *, i32, ) +declare void @llvm.masked.store.nxv2i64(, *, i32, ) +declare void @llvm.masked.store.nxv2f16(, *, i32, ) +declare void @llvm.masked.store.nxv2f32(, *, i32, ) +declare void @llvm.masked.store.nxv2f64(, *, i32, ) + +; 4-element contiguous stores. +declare void @llvm.masked.store.nxv4i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv4i16(, *, i32, ) +declare void @llvm.masked.store.nxv4i32(, *, i32, ) +declare void @llvm.masked.store.nxv4f16(, *, i32, ) +declare void @llvm.masked.store.nxv4f32(, *, i32, ) + +; 8-element contiguous stores. +declare void @llvm.masked.store.nxv8i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv8i16(, *, i32, ) +declare void @llvm.masked.store.nxv8f16(, *, i32, ) + +; 16-element contiguous stores. 
+declare void @llvm.masked.store.nxv16i8(, *, i32, ) diff --git a/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll @@ -0,0 +1,610 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s + +; 2-lane contiguous load/stores + +define void @test_masked_ldst_sv2i8(i8 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1] +; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %data = call @llvm.masked.load.nxv2i8(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv2i8( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv2i16(i16 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i16: +; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %data = call @llvm.masked.load.nxv2i16(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv2i16( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv2i32(i32 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i32: +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %data = call @llvm.masked.load.nxv2i32(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv2i32( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv2i64(i64 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %base_i64 = getelementptr i64, i64* %base, i64 %offset + %base_addr = bitcast i64* %base_i64 to * + %data = call @llvm.masked.load.nxv2i64(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv2i64( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv2f16(half * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_half = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_half to * + %data = call @llvm.masked.load.nxv2f16(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv2f16( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv2f32(float * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_float = getelementptr float, float* %base, i64 %offset + %base_addr = bitcast float* %base_float to * + %data = call 
@llvm.masked.load.nxv2f32(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv2f32( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv2f64(double * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %base_double = getelementptr double, double* %base, i64 %offset + %base_addr = bitcast double* %base_double to * + %data = call @llvm.masked.load.nxv2f64(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv2f64( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +; 2-lane zero/sign extended contiguous loads. + +define @masked_zload_sv2i8_to_sv2i64(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64: +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv2i8(* %base_addr, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i8_to_sv2i64(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64: +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv2i8(* %base_addr, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +define @masked_zload_sv2i16_to_sv2i64(i16* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64: +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %load = call @llvm.masked.load.nxv2i16(* %base_addr, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i16_to_sv2i64(i16* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64: +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %load = call @llvm.masked.load.nxv2i16(* %base_addr, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + + +define @masked_zload_sv2i32_to_sv2i64(i32* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64: +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %load = call @llvm.masked.load.nxv2i32(* %base_addr, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i32_to_sv2i64(i32* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64: +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %load = call @llvm.masked.load.nxv2i32(* %base_addr, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +; 2-lane truncating contiguous stores. 
+ +define void @masked_trunc_store_sv2i64_to_sv2i8( %val, i8 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8: +; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i8( %trunc, + *%base_addr, + i32 8, + %mask) + ret void +} + +define void @masked_trunc_store_sv2i64_to_sv2i16( %val, i16 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i16( %trunc, + *%base_addr, + i32 8, + %mask) + ret void +} + +define void @masked_trunc_store_sv2i64_to_sv2i32( %val, i32 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i32( %trunc, + *%base_addr, + i32 8, + %mask) + ret void +} + +; 4-lane contiguous load/stores. + +define void @test_masked_ldst_sv4i8(i8 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1] +; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %data = call @llvm.masked.load.nxv4i8(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv4i8( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv4i16(i16 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i16: +; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %data = call @llvm.masked.load.nxv4i16(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv4i16( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv4i32(i32 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %data = call @llvm.masked.load.nxv4i32(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv4i32( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv4f16(half * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_f16 = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_f16 to * + %data = call @llvm.masked.load.nxv4f16(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv4f16( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void 
@test_masked_ldst_sv4f32(float * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_f32 = getelementptr float, float* %base, i64 %offset + %base_addr = bitcast float* %base_f32 to * + %data = call @llvm.masked.load.nxv4f32(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv4f32( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +; 4-lane zero/sign extended contiguous loads. + +define @masked_zload_sv4i8_to_sv4i32(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32: +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv4i8(* %base_addr, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv4i8_to_sv4i32(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32: +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv4i8(* %base_addr, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +define @masked_zload_sv4i16_to_sv4i32(i16* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32: +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %load = call @llvm.masked.load.nxv4i16(* %base_addr, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv4i16_to_sv4i32(i16* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32: +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %load = call @llvm.masked.load.nxv4i16(* %base_addr, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +; 4-lane truncating contiguous stores. + +define void @masked_trunc_store_sv4i32_to_sv4i8( %val, i8 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8: +; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv4i8( %trunc, + *%base_addr, + i32 8, + %mask) + ret void +} + +define void @masked_trunc_store_sv4i32_to_sv4i16( %val, i16 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv4i16( %trunc, + *%base_addr, + i32 8, + %mask) + ret void +} + +; 8-lane contiguous load/stores. 
+ +define void @test_masked_ldst_sv8i8(i8 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1] +; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %data = call @llvm.masked.load.nxv8i8(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv8i8( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv8i16(i16 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %data = call @llvm.masked.load.nxv8i16(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv8i16( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +define void @test_masked_ldst_sv8f16(half * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_f16 = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_f16 to * + %data = call @llvm.masked.load.nxv8f16(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv8f16( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +; 8-lane zero/sign extended contiguous loads. + +define @masked_zload_sv8i8_to_sv8i16(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16: +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv8i8(* %base_addr, + i32 1, + %mask, + undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv8i8_to_sv8i16(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16: +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv8i8(* %base_addr, + i32 1, + %mask, + undef) + %ext = sext %load to + ret %ext +} + +; 8-lane truncating contiguous stores. + +define void @masked_trunc_store_sv8i16_to_sv8i8( %val, i8 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8: +; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv8i8( %trunc, + *%base_addr, + i32 8, + %mask) + ret void +} + +; 16-lane contiguous load/stores. + +define void @test_masked_ldst_sv16i8(i8 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv16i8: +; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1] +; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %data = call @llvm.masked.load.nxv16i8(* %base_addr, + i32 1, + %mask, + undef) + call void @llvm.masked.store.nxv16i8( %data, + * %base_addr, + i32 1, + %mask) + ret void +} + +; 2-element contiguous loads. 
+declare @llvm.masked.load.nxv2i8 (* , i32, , ) +declare @llvm.masked.load.nxv2i16(*, i32, , ) +declare @llvm.masked.load.nxv2i32(*, i32, , ) +declare @llvm.masked.load.nxv2i64(*, i32, , ) +declare @llvm.masked.load.nxv2f16(*, i32, , ) +declare @llvm.masked.load.nxv2f32(*, i32, , ) +declare @llvm.masked.load.nxv2f64(*, i32, , ) + +; 4-element contiguous loads. +declare @llvm.masked.load.nxv4i8 (* , i32, , ) +declare @llvm.masked.load.nxv4i16(*, i32, , ) +declare @llvm.masked.load.nxv4i32(*, i32, , ) +declare @llvm.masked.load.nxv4f16(*, i32, , ) +declare @llvm.masked.load.nxv4f32(*, i32, , ) + +; 8-element contiguous loads. +declare @llvm.masked.load.nxv8i8 (* , i32, , ) +declare @llvm.masked.load.nxv8i16(*, i32, , ) +declare @llvm.masked.load.nxv8f16(*, i32, , ) + +; 16-element contiguous loads. +declare @llvm.masked.load.nxv16i8(*, i32, , ) + +; 2-element contiguous stores. +declare void @llvm.masked.store.nxv2i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv2i16(, *, i32, ) +declare void @llvm.masked.store.nxv2i32(, *, i32, ) +declare void @llvm.masked.store.nxv2i64(, *, i32, ) +declare void @llvm.masked.store.nxv2f16(, *, i32, ) +declare void @llvm.masked.store.nxv2f32(, *, i32, ) +declare void @llvm.masked.store.nxv2f64(, *, i32, ) + +; 4-element contiguous stores. +declare void @llvm.masked.store.nxv4i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv4i16(, *, i32, ) +declare void @llvm.masked.store.nxv4i32(, *, i32, ) +declare void @llvm.masked.store.nxv4f16(, *, i32, ) +declare void @llvm.masked.store.nxv4f32(, *, i32, ) + +; 8-element contiguous stores. +declare void @llvm.masked.store.nxv8i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv8i16(, *, i32, ) +declare void @llvm.masked.store.nxv8f16(, *, i32, ) + +; 16-element contiguous stores. +declare void @llvm.masked.store.nxv16i8(, *, i32, ) diff --git a/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll @@ -0,0 +1,171 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s + +; Range checks: for all the instruction tested in this file, the +; immediate must be within the range [-8, 7] (4-bit immediate). Out of +; range values are tested only in one case (following). Valid values +; are tested all through the rest of the file. 
+ +define void @imm_out_of_range( * %base, %mask) nounwind { +; CHECK-LABEL: imm_out_of_range: +; CHECK-NEXT: rdvl x8, #8 +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}] +; CHECK-NEXT: rdvl x8, #-9 +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x{{[0-9]+}}] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 8 + %data = call @llvm.aarch64.sve.ldnt1.nxv2i64( %mask, + * %base_load) + %base_store = getelementptr , * %base, i64 -9 + call void @llvm.aarch64.sve.stnt1.nxv2i64( %data, + %mask, + * %base_store) + ret void +} + +; 2-lane non-temporal load/stores + + +define void @test_masked_ldst_sv2i64( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.aarch64.sve.ldnt1.nxv2i64( %mask, + * %base_load) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.aarch64.sve.stnt1.nxv2i64( %data, + %mask, + * %base_store) + ret void +} + +define void @test_masked_ldst_sv2f64( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl] +; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, #-5, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -6 + %data = call @llvm.aarch64.sve.ldnt1.nxv2f64( %mask, + * %base_load) + %base_store = getelementptr , * %base, i64 -5 + call void @llvm.aarch64.sve.stnt1.nxv2f64( %data, + %mask, + * %base_store) + ret void +} + +; 4-lane non-temporal load/stores. + +define void @test_masked_ldst_sv4i32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.aarch64.sve.ldnt1.nxv4i32( %mask, + * %base_load) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.aarch64.sve.stnt1.nxv4i32( %data, + %mask, + * %base_store) + ret void +} + +define void @test_masked_ldst_sv4f32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.aarch64.sve.ldnt1.nxv4f32( %mask, + * %base_load) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.aarch64.sve.stnt1.nxv4f32( %data, + %mask, + * %base_store) + ret void +} + + +; 8-lane non-temporal load/stores. 
+ +define void @test_masked_ldst_sv8i16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.aarch64.sve.ldnt1.nxv8i16( %mask, + * %base_load) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.aarch64.sve.stnt1.nxv8i16( %data, + %mask, + * %base_store) + ret void +} + +define void @test_masked_ldst_sv8f16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.aarch64.sve.ldnt1.nxv8f16( %mask, + * %base_load) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.aarch64.sve.stnt1.nxv8f16( %data, + %mask, + * %base_store) + ret void +} + +; 16-lane non-temporal load/stores. + +define void @test_masked_ldst_sv16i8( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv16i8: +; CHECK-NEXT: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: stnt1b { z[[DATA]].b }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.aarch64.sve.ldnt1.nxv16i8( %mask, + * %base_load) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.aarch64.sve.stnt1.nxv16i8( %data, + %mask, + * %base_store) + ret void +} + +; 2-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv2i64(, *) +declare @llvm.aarch64.sve.ldnt1.nxv2f64(, *) + +; 4-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv4i32(, *) +declare @llvm.aarch64.sve.ldnt1.nxv4f32(, *) + +; 8-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv8i16(, *) +declare @llvm.aarch64.sve.ldnt1.nxv8f16(, *) + +; 16-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv16i8(, *) + +; 2-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv2i64(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv2f64(, , *) + +; 4-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv4i32(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv4f32(, , *) + +; 8-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv8i16(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv8f16(, , *) + +; 16-element non-temporal stores. 
+declare void @llvm.aarch64.sve.stnt1.nxv16i8(, , *) + diff --git a/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll @@ -0,0 +1,145 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s + +; 2-lane non-temporal load/stores + +define void @test_masked_ldst_sv2i64(i64* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %base_i64 = getelementptr i64, i64* %base, i64 %offset + %base_addr = bitcast i64* %base_i64 to * + %data = call @llvm.aarch64.sve.ldnt1.nxv2i64( %mask, + * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv2i64( %data, + %mask, + * %base_addr) + ret void +} + +define void @test_masked_ldst_sv2f64(double* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %base_double = getelementptr double, double* %base, i64 %offset + %base_addr = bitcast double* %base_double to * + %data = call @llvm.aarch64.sve.ldnt1.nxv2f64( %mask, + * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv2f64( %data, + %mask, + * %base_addr) + ret void +} + +; 4-lane non-temporal load/stores. + +define void @test_masked_ldst_sv4i32(i32* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %data = call @llvm.aarch64.sve.ldnt1.nxv4i32( %mask, + * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv4i32( %data, + %mask, + * %base_addr) + ret void +} + +define void @test_masked_ldst_sv4f32(float* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_float = getelementptr float, float* %base, i64 %offset + %base_addr = bitcast float* %base_float to * + %data = call @llvm.aarch64.sve.ldnt1.nxv4f32( %mask, + * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv4f32( %data, + %mask, + * %base_addr) + ret void +} + + +; 8-lane non-temporal load/stores. 
+ +define void @test_masked_ldst_sv8i16(i16* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %data = call @llvm.aarch64.sve.ldnt1.nxv8i16( %mask, + * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv8i16( %data, + %mask, + * %base_addr) + ret void +} + +define void @test_masked_ldst_sv8f16(half* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_half = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_half to * + %data = call @llvm.aarch64.sve.ldnt1.nxv8f16( %mask, + * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv8f16( %data, + %mask, + * %base_addr) + ret void +} + +; 16-lane non-temporal load/stores. + +define void @test_masked_ldst_sv16i8(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv16i8: +; CHECK-NEXT: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1] +; CHECK-NEXT: stnt1b { z[[DATA]].b }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %data = call @llvm.aarch64.sve.ldnt1.nxv16i8( %mask, + * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv16i8( %data, + %mask, + * %base_addr) + ret void +} + +; 2-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv2i64(, *) +declare @llvm.aarch64.sve.ldnt1.nxv2f64(, *) + +; 4-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv4i32(, *) +declare @llvm.aarch64.sve.ldnt1.nxv4f32(, *) + +; 8-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv8i16(, *) +declare @llvm.aarch64.sve.ldnt1.nxv8f16(, *) + +; 16-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv16i8(, *) + +; 2-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv2i64(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv2f64(, , *) + +; 4-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv4i32(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv4f32(, , *) + +; 8-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv8i16(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv8f16(, , *) + +; 16-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv16i8(, , *)
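For reference, a minimal self-contained version of the 2-lane non-temporal reg+reg case looks roughly like the following sketch. It is not taken from the patch: the function name is hypothetical, and the <vscale x 2 x i64> / <vscale x 2 x i1> operand types are assumed from the intrinsic declarations above, whose element types were dropped during text extraction. The expected selection is the ldnt1d/stnt1d pair with an lsl #3 scaled offset, as in test_masked_ldst_sv2i64 above.

define void @ldnt1_stnt1_reg_reg_example(i64* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
  ; The (add %base, (shl %offset, 3)) address computed by this GEP is what
  ; SelectSVERegRegAddrMode<3> folds into the [x0, x1, lsl #3] operand.
  %addr = getelementptr i64, i64* %base, i64 %offset
  %addr_vec = bitcast i64* %addr to <vscale x 2 x i64>*
  %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask,
                                                                  <vscale x 2 x i64>* %addr_vec)
  call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data,
                                            <vscale x 2 x i1> %mask,
                                            <vscale x 2 x i64>* %addr_vec)
  ret void
}

declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)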