diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3928,6 +3928,30 @@
       const auto *CI = dyn_cast_or_null<ConstantInt>(C);
       if (CI && CI->isZero())
         continue;
+
+      // Constant integer offsets on pointers to scalable vector types.
+      VectorType *VecTy = dyn_cast<VectorType>(GTI.getIndexedType());
+      if (VecTy && VecTy->isScalable() && !IsVectorGEP) {
+        EVT PTy = TLI.getPointerTy(DAG.getDataLayout(), AS);
+
+        // If index is smaller or larger than intptr_t, truncate or extend it.
+        SDValue IdxN = getValue(Idx);
+        IdxN = DAG.getSExtOrTrunc(IdxN, dl, N.getValueType());
+
+        // Determine offset (in bits).
+        SDValue OffsVal;
+        if (CI) {
+          APInt MinSize = CI->getValue() * VecTy->getBitWidth();
+          OffsVal = DAG.getVScale(dl, PTy, MinSize);
+        } else {
+          SDValue VS =
+              DAG.getVScale(dl, PTy, /*MulImm=*/APInt(PTy.getSizeInBits(), 1));
+          OffsVal = DAG.getNode(ISD::MUL, dl, PTy, VS, IdxN);
+        }
+        N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal);
+        continue;
+      }
+
       if (CI && !ElementScalable) {
         APInt Offs = ElementMul * CI->getValue().sextOrTrunc(IdxSize);
         LLVMContext &Context = *DAG.getContext();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -221,6 +221,9 @@
   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);

   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
+  template <int64_t Min, int64_t Max>
+  bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
+                                SDValue &OffImm);

   void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
   void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -4399,3 +4402,41 @@
                                          CodeGenOpt::Level OptLevel) {
   return new AArch64DAGToDAGISel(TM, OptLevel);
 }
+
+/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
+/// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max,
+/// where Root is the memory access using N for its address.
+template <int64_t Min, int64_t Max>
+bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
+                                                   SDValue &Base,
+                                                   SDValue &OffImm) {
+  if (!isa<MemSDNode>(Root))
+    return false;
+
+  EVT MemVT = cast<MemSDNode>(Root)->getMemoryVT();
+
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+
+  SDValue VScale = N.getOperand(1);
+  if (VScale.getOpcode() != ISD::VSCALE)
+    return false;
+
+  TypeSize TS = MemVT.getSizeInBits();
+  unsigned MemWidthBits = TS.getKnownMinSize();
+  int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
+
+  if ((MulImm % MemWidthBits) == 0) {
+    int64_t Offset = MulImm / MemWidthBits;
+    if ((Offset >= Min) && (Offset <= Max)) {
+      Base = N.getOperand(0);
+      OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
+      LLVM_DEBUG(dbgs() << "Match found:\n"; dbgs() << "ROOT:\n"; Root->dumpr();
+                 dbgs() << "BASE:\n"; Base.dumpr(); dbgs() << "OFFSET:\n";
+                 OffImm.dumpr());
+      return true;
+    }
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1207,6 +1207,11 @@

 multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
                      Instruction RegImmInst> {
+  // reg + imm
+  let AddedComplexity = 2 in {
+    def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
+                         (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>;
+  }
   def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))),
                        (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>;
 }
@@ -1243,6 +1248,11 @@

 multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
                       Instruction RegImmInst> {
+  // reg + imm
+  let AddedComplexity = 2 in {
+    def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
+                       (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
+  }
   def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
                      (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
 }
@@ -1284,6 +1294,10 @@
 multiclass unpred_store<ValueType Ty, Instruction RegImmInst, Instruction PTrue> {
   def _fi : Pat<(store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
                 (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+  let AddedComplexity = 2 in {
+    def _reg_imm : Pat<(store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
+                       (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+  }
 }

 defm Pat_ST1B : unpred_store<nxv16i8, ST1B_IMM, PTRUE_B>;
@@ -1297,6 +1311,10 @@
 multiclass unpred_load<ValueType Ty, Instruction RegImmInst, Instruction PTrue> {
   def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
                 (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+  let AddedComplexity = 2 in {
+    def _reg_imm : Pat<(Ty (load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
+                       (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+  }
 }

 defm Pat_LD1B : unpred_load<nxv16i8, LD1B_IMM, PTRUE_B>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -6920,3 +6920,7 @@

   let Constraints = "$Zdn = $_Zdn";
 }
+
+/// Addressing modes
+def am_sve_indexed_s4 : ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8, 7>", [], [SDNPWantRoot]>;
+
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg+imm.ll b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg+imm.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg+imm.ll
@@ -0,0 +1,486 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%sv2i1 = type <vscale x 2 x i1>
+%sv2i8 = type <vscale x 2 x i8>
+%sv2i16 = type <vscale x 2 x i16>
+%sv2i32 = type <vscale x 2 x i32>
+%sv2i64 = type <vscale x 2 x i64>
+%sv2f16 = type <vscale x 2 x half>
+%sv2f32 = type <vscale x 2 x float>
+%sv2f64 = type <vscale x 2 x double>
+
+%sv4i1 = type <vscale x 4 x i1>
+%sv4i8 = type <vscale x 4 x i8>
+%sv4i16 = type <vscale x 4 x i16>
+%sv4i32 = type <vscale x 4 x i32>
+%sv4f16 = type <vscale x 4 x half>
+%sv4f32 = type <vscale x 4 x float>
+
+%sv8i1 = type <vscale x 8 x i1>
+%sv8i8 = type <vscale x 8 x i8>
+%sv8i16 = type <vscale x 8 x i16>
+%sv8f16 = type <vscale x 8 x half>
+
+%sv16i1 = type <vscale x 16 x i1>
+%sv16i8 = type <vscale x 16 x i8>
+
+; 2-element contiguous loads.
+declare %sv2i8  @llvm.masked.load.nxv2i8 (%sv2i8* , i32, %sv2i1, %sv2i8 )
+declare %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16*, i32, %sv2i1, %sv2i16)
+declare %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32*, i32, %sv2i1, %sv2i32)
+declare %sv2i64 @llvm.masked.load.nxv2i64(%sv2i64*, i32, %sv2i1, %sv2i64)
+declare %sv2f16 @llvm.masked.load.nxv2f16(%sv2f16*, i32, %sv2i1, %sv2f16)
+declare %sv2f32 @llvm.masked.load.nxv2f32(%sv2f32*, i32, %sv2i1, %sv2f32)
+declare %sv2f64 @llvm.masked.load.nxv2f64(%sv2f64*, i32, %sv2i1, %sv2f64)
+
+; 4-element contiguous loads.
+declare %sv4i8  @llvm.masked.load.nxv4i8 (%sv4i8* , i32, %sv4i1, %sv4i8 )
+declare %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16*, i32, %sv4i1, %sv4i16)
+declare %sv4i32 @llvm.masked.load.nxv4i32(%sv4i32*, i32, %sv4i1, %sv4i32)
+declare %sv4f16 @llvm.masked.load.nxv4f16(%sv4f16*, i32, %sv4i1, %sv4f16)
+declare %sv4f32 @llvm.masked.load.nxv4f32(%sv4f32*, i32, %sv4i1, %sv4f32)
+
+; 8-element contiguous loads.
+declare %sv8i8  @llvm.masked.load.nxv8i8 (%sv8i8* , i32, %sv8i1, %sv8i8 )
+declare %sv8i16 @llvm.masked.load.nxv8i16(%sv8i16*, i32, %sv8i1, %sv8i16)
+declare %sv8f16 @llvm.masked.load.nxv8f16(%sv8f16*, i32, %sv8i1, %sv8f16)
+
+; 16-element contiguous loads.
+declare %sv16i8 @llvm.masked.load.nxv16i8(%sv16i8*, i32, %sv16i1, %sv16i8)
+
+; 2-element contiguous stores.
+declare void @llvm.masked.store.nxv2i8 (%sv2i8 , %sv2i8* , i32, %sv2i1)
+declare void @llvm.masked.store.nxv2i16(%sv2i16, %sv2i16*, i32, %sv2i1)
+declare void @llvm.masked.store.nxv2i32(%sv2i32, %sv2i32*, i32, %sv2i1)
+declare void @llvm.masked.store.nxv2i64(%sv2i64, %sv2i64*, i32, %sv2i1)
+declare void @llvm.masked.store.nxv2f16(%sv2f16, %sv2f16*, i32, %sv2i1)
+declare void @llvm.masked.store.nxv2f32(%sv2f32, %sv2f32*, i32, %sv2i1)
+declare void @llvm.masked.store.nxv2f64(%sv2f64, %sv2f64*, i32, %sv2i1)
+
+; 4-element contiguous stores.
+declare void @llvm.masked.store.nxv4i8 (%sv4i8 , %sv4i8* , i32, %sv4i1)
+declare void @llvm.masked.store.nxv4i16(%sv4i16, %sv4i16*, i32, %sv4i1)
+declare void @llvm.masked.store.nxv4i32(%sv4i32, %sv4i32*, i32, %sv4i1)
+declare void @llvm.masked.store.nxv4f16(%sv4f16, %sv4f16*, i32, %sv4i1)
+declare void @llvm.masked.store.nxv4f32(%sv4f32, %sv4f32*, i32, %sv4i1)
+
+; 8-element contiguous stores.
+declare void @llvm.masked.store.nxv8i8 (%sv8i8 , %sv8i8* , i32, %sv8i1)
+declare void @llvm.masked.store.nxv8i16(%sv8i16, %sv8i16*, i32, %sv8i1)
+declare void @llvm.masked.store.nxv8f16(%sv8f16, %sv8f16*, i32, %sv8i1)
+
+; 16-element contiguous stores.
+declare void @llvm.masked.store.nxv16i8(%sv16i8, %sv16i8*, i32, %sv16i1)
+
+; Range checks: for all the instructions tested in this file, the
+; immediate must be within the range [-8, 7] (4-bit signed immediate).
+; Out-of-range values are tested only in the first case below; in-range
+; values are exercised throughout the rest of the file.
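+;
+; Illustrative example: for %sv2i64 (<vscale x 2 x i64>) a getelementptr
+; index of 8 is an offset of 8 whole vector registers, i.e. a "mul vl"
+; immediate of 8, one past the upper bound of the signed 4-bit field. The
+; reg+imm form is therefore rejected, the scaled offset is added to the base
+; separately, and the access falls back to a plain register address, as
+; checked in the first test below.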
+ +define void @imm_out_of_range(%sv2i64 * %base, %sv2i1 %mask) { +; CHECK-LABEL: imm_out_of_range: +; CHECK: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}] +; CHECK: st1d { z[[DATA]].d }, p0, [x{{[0-9]+}}] +; CHECK: ret + %base_load = getelementptr %sv2i64, %sv2i64* %base, i64 8 + %data = call %sv2i64 @llvm.masked.load.nxv2i64(%sv2i64* %base_load, i32 1, %sv2i1 %mask, %sv2i64 undef) + %base_store = getelementptr %sv2i64, %sv2i64 * %base, i64 -9 + call void @llvm.masked.store.nxv2i64(%sv2i64 %data, %sv2i64* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +; 2-lane contiguous load/stores + +define void @test_masked_ldst_sv2i8(%sv2i8 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2i8: +; CHECK: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1b { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2i8, %sv2i8* %base, i64 -8 + %data = call %sv2i8 @llvm.masked.load.nxv2i8(%sv2i8* %base_load, i32 1, %sv2i1 %mask, %sv2i8 undef) + %base_store = getelementptr %sv2i8, %sv2i8 * %base, i64 -7 + call void @llvm.masked.store.nxv2i8(%sv2i8 %data, %sv2i8* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2i16(%sv2i16 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2i16: +; CHECK: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2i16, %sv2i16* %base, i64 -8 + %data = call %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16* %base_load, i32 1, %sv2i1 %mask, %sv2i16 undef) + %base_store = getelementptr %sv2i16, %sv2i16 * %base, i64 -7 + call void @llvm.masked.store.nxv2i16(%sv2i16 %data, %sv2i16* %base_store, i32 1, %sv2i1 %mask) + ret void +} + + +define void @test_masked_ldst_sv2i32(%sv2i32 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2i32: +; CHECK: ld1sw { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2i32, %sv2i32* %base, i64 -8 + %data = call %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32* %base_load, i32 1, %sv2i1 %mask, %sv2i32 undef) + %base_store = getelementptr %sv2i32, %sv2i32 * %base, i64 -7 + call void @llvm.masked.store.nxv2i32(%sv2i32 %data, %sv2i32* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2i64(%sv2i64 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1d { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2i64, %sv2i64* %base, i64 -8 + %data = call %sv2i64 @llvm.masked.load.nxv2i64(%sv2i64* %base_load, i32 1, %sv2i1 %mask, %sv2i64 undef) + %base_store = getelementptr %sv2i64, %sv2i64 * %base, i64 -7 + call void @llvm.masked.store.nxv2i64(%sv2i64 %data, %sv2i64* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2f16(%sv2f16 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2f16: +; CHECK: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2f16, %sv2f16* %base, i64 -8 + %data = call %sv2f16 @llvm.masked.load.nxv2f16(%sv2f16* %base_load, i32 1, %sv2i1 %mask, %sv2f16 undef) + %base_store = getelementptr %sv2f16, %sv2f16 * %base, i64 -7 + call void @llvm.masked.store.nxv2f16(%sv2f16 %data, %sv2f16* %base_store, i32 1, %sv2i1 %mask) + ret void +} + + +define void 
@test_masked_ldst_sv2f32(%sv2f32 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2f32: +; CHECK: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2f32, %sv2f32* %base, i64 -8 + %data = call %sv2f32 @llvm.masked.load.nxv2f32(%sv2f32* %base_load, i32 1, %sv2i1 %mask, %sv2f32 undef) + %base_store = getelementptr %sv2f32, %sv2f32 * %base, i64 -7 + call void @llvm.masked.store.nxv2f32(%sv2f32 %data, %sv2f32* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2f64(%sv2f64 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl] +; CHECK: st1d { z[[DATA]].d }, p0, [x0, #-5, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2f64, %sv2f64* %base, i64 -6 + %data = call %sv2f64 @llvm.masked.load.nxv2f64(%sv2f64* %base_load, i32 1, %sv2i1 %mask, %sv2f64 undef) + %base_store = getelementptr %sv2f64, %sv2f64 * %base, i64 -5 + call void @llvm.masked.store.nxv2f64(%sv2f64 %data, %sv2f64* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +; 2-lane zero/sign extended contiguous loads. + +define %sv2i64 @masked_zload_sv2i8_to_sv2i64(%sv2i8* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64: +; CHECK: ld1b { z0.d }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i8, %sv2i8* %base, i64 -4 + %load = call %sv2i8 @llvm.masked.load.nxv2i8(%sv2i8* %base_load, i32 1, %sv2i1 %mask, %sv2i8 undef) + %ext = zext %sv2i8 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_sload_sv2i8_to_sv2i64(%sv2i8* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64: +; CHECK: ld1sb { z0.d }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i8, %sv2i8* %base, i64 -3 + %load = call %sv2i8 @llvm.masked.load.nxv2i8(%sv2i8* %base_load, i32 1, %sv2i1 %mask, %sv2i8 undef) + %ext = sext %sv2i8 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_zload_sv2i16_to_sv2i64(%sv2i16* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64: +; CHECK: ld1h { z0.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i16, %sv2i16* %base, i64 1 + %load = call %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16* %base_load, i32 1, %sv2i1 %mask, %sv2i16 undef) + %ext = zext %sv2i16 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_sload_sv2i16_to_sv2i64(%sv2i16* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64: +; CHECK: ld1sh { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i16, %sv2i16* %base, i64 2 + %load = call %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16* %base_load, i32 1, %sv2i1 %mask, %sv2i16 undef) + %ext = sext %sv2i16 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_zload_sv2i32_to_sv2i64(%sv2i32* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64: +; CHECK: ld1w { z0.d }, p0/z, [x0, #-2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i32, %sv2i32* %base, i64 -2 + %load = call %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32* %base_load, i32 1, %sv2i1 %mask, %sv2i32 undef) + %ext = zext %sv2i32 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_sload_sv2i32_to_sv2i64(%sv2i32* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64: +; CHECK: ld1sw { z0.d }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i32, %sv2i32* %base, 
i64 -1 + %load = call %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32* %base_load, i32 1, %sv2i1 %mask, %sv2i32 undef) + %ext = sext %sv2i32 %load to %sv2i64 + ret %sv2i64 %ext +} + +; 2-lane truncating contiguous stores. + +define void @masked_trunc_store_sv2i64_to_sv2i8(%sv2i64 %val, %sv2i8 *%base, %sv2i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8: +; CHECK: st1b { z0.d }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i8, %sv2i8* %base, i64 3 + %trunc = trunc %sv2i64 %val to %sv2i8 + call void @llvm.masked.store.nxv2i8(%sv2i8 %trunc, %sv2i8 *%base_load, i32 8, %sv2i1 %mask) + ret void +} + + +define void @masked_trunc_store_sv2i64_to_sv2i16(%sv2i64 %val, %sv2i16 *%base, %sv2i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16: +; CHECK: st1h { z0.d }, p0, [x0, #4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i16, %sv2i16* %base, i64 4 + %trunc = trunc %sv2i64 %val to %sv2i16 + call void @llvm.masked.store.nxv2i16(%sv2i16 %trunc, %sv2i16 *%base_load, i32 8, %sv2i1 %mask) + ret void +} + +define void @masked_trunc_store_sv2i64_to_sv2i32(%sv2i64 %val, %sv2i32 *%base, %sv2i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32: +; CHECK: st1w { z0.d }, p0, [x0, #5, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i32, %sv2i32* %base, i64 5 + %trunc = trunc %sv2i64 %val to %sv2i32 + call void @llvm.masked.store.nxv2i32(%sv2i32 %trunc, %sv2i32 *%base_load, i32 8, %sv2i1 %mask) + ret void +} + +; 4-lane contiguous load/stores. + +define void @test_masked_ldst_sv4i8(%sv4i8 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4i8: +; CHECK: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK: st1b { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4i8, %sv4i8* %base, i64 -1 + %data = call %sv4i8 @llvm.masked.load.nxv4i8(%sv4i8* %base_load, i32 1, %sv4i1 %mask, %sv4i8 undef) + %base_store = getelementptr %sv4i8, %sv4i8 * %base, i64 2 + call void @llvm.masked.store.nxv4i8(%sv4i8 %data, %sv4i8* %base_store, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4i16(%sv4i16 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4i16: +; CHECK: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4i16, %sv4i16* %base, i64 -1 + %data = call %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16* %base_load, i32 1, %sv4i1 %mask, %sv4i16 undef) + %base_store = getelementptr %sv4i16, %sv4i16 * %base, i64 2 + call void @llvm.masked.store.nxv4i16(%sv4i16 %data, %sv4i16* %base_store, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4i32(%sv4i32 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl] +; CHECK: st1w { z[[DATA]].s }, p0, [x0, #7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4i32, %sv4i32* %base, i64 6 + %data = call %sv4i32 @llvm.masked.load.nxv4i32(%sv4i32* %base_load, i32 1, %sv4i1 %mask, %sv4i32 undef) + %base_store = getelementptr %sv4i32, %sv4i32 * %base, i64 7 + call void @llvm.masked.store.nxv4i32(%sv4i32 %data, %sv4i32* %base_store, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4f16(%sv4f16 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4f16: +; CHECK: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4f16, 
%sv4f16* %base, i64 -1 + %data = call %sv4f16 @llvm.masked.load.nxv4f16(%sv4f16* %base_load, i32 1, %sv4i1 %mask, %sv4f16 undef) + %base_store = getelementptr %sv4f16, %sv4f16 * %base, i64 2 + call void @llvm.masked.store.nxv4f16(%sv4f16 %data, %sv4f16* %base_store, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4f32(%sv4f32 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK: st1w { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4f32, %sv4f32* %base, i64 -1 + %data = call %sv4f32 @llvm.masked.load.nxv4f32(%sv4f32* %base_load, i32 1, %sv4i1 %mask, %sv4f32 undef) + %base_store = getelementptr %sv4f32, %sv4f32 * %base, i64 2 + call void @llvm.masked.store.nxv4f32(%sv4f32 %data, %sv4f32* %base_store, i32 1, %sv4i1 %mask) + ret void +} + +; 4-lane zero/sign extended contiguous loads. + +define %sv4i32 @masked_zload_sv4i8_to_sv4i32(%sv4i8* %base, %sv4i1 %mask) { +; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32: +; CHECK: ld1b { z0.s }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i8, %sv4i8* %base, i64 -4 + %load = call %sv4i8 @llvm.masked.load.nxv4i8(%sv4i8* %base_load, i32 1, %sv4i1 %mask, %sv4i8 undef) + %ext = zext %sv4i8 %load to %sv4i32 + ret %sv4i32 %ext +} + +define %sv4i32 @masked_sload_sv4i8_to_sv4i32(%sv4i8* %base, %sv4i1 %mask) { +; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32: +; CHECK: ld1sb { z0.s }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i8, %sv4i8* %base, i64 -3 + %load = call %sv4i8 @llvm.masked.load.nxv4i8(%sv4i8* %base_load, i32 1, %sv4i1 %mask, %sv4i8 undef) + %ext = sext %sv4i8 %load to %sv4i32 + ret %sv4i32 %ext +} + +define %sv4i32 @masked_zload_sv4i16_to_sv4i32(%sv4i16* %base, %sv4i1 %mask) { +; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32: +; CHECK: ld1h { z0.s }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i16, %sv4i16* %base, i64 1 + %load = call %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16* %base_load, i32 1, %sv4i1 %mask, %sv4i16 undef) + %ext = zext %sv4i16 %load to %sv4i32 + ret %sv4i32 %ext +} + +define %sv4i32 @masked_sload_sv4i16_to_sv4i32(%sv4i16* %base, %sv4i1 %mask) { +; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32: +; CHECK: ld1sh { z0.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i16, %sv4i16* %base, i64 2 + %load = call %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16* %base_load, i32 1, %sv4i1 %mask, %sv4i16 undef) + %ext = sext %sv4i16 %load to %sv4i32 + ret %sv4i32 %ext +} + +; 4-lane truncating contiguous stores. + +define void @masked_trunc_store_sv4i32_to_sv4i8(%sv4i32 %val, %sv4i8 *%base, %sv4i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8: +; CHECK: st1b { z0.s }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i8, %sv4i8* %base, i64 3 + %trunc = trunc %sv4i32 %val to %sv4i8 + call void @llvm.masked.store.nxv4i8(%sv4i8 %trunc, %sv4i8 *%base_load, i32 8, %sv4i1 %mask) + ret void +} + + +define void @masked_trunc_store_sv4i32_to_sv4i16(%sv4i32 %val, %sv4i16 *%base, %sv4i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16: +; CHECK: st1h { z0.s }, p0, [x0, #4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i16, %sv4i16* %base, i64 4 + %trunc = trunc %sv4i32 %val to %sv4i16 + call void @llvm.masked.store.nxv4i16(%sv4i16 %trunc, %sv4i16 *%base_load, i32 8, %sv4i1 %mask) + ret void +} + +; 8-lane contiguous load/stores. 
+ +define void @test_masked_ldst_sv8i8(%sv8i8 * %base, %sv8i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv8i8: +; CHECK: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl] +; CHECK: st1b { z[[DATA]].h }, p0, [x0, #7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv8i8, %sv8i8* %base, i64 6 + %data = call %sv8i8 @llvm.masked.load.nxv8i8(%sv8i8* %base_load, i32 1, %sv8i1 %mask, %sv8i8 undef) + %base_store = getelementptr %sv8i8, %sv8i8 * %base, i64 7 + call void @llvm.masked.store.nxv8i8(%sv8i8 %data, %sv8i8* %base_store, i32 1, %sv8i1 %mask) + ret void +} + +define void @test_masked_ldst_sv8i16(%sv8i16 * %base, %sv8i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl] +; CHECK: st1h { z[[DATA]].h }, p0, [x0, #7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv8i16, %sv8i16* %base, i64 6 + %data = call %sv8i16 @llvm.masked.load.nxv8i16(%sv8i16* %base_load, i32 1, %sv8i1 %mask, %sv8i16 undef) + %base_store = getelementptr %sv8i16, %sv8i16 * %base, i64 7 + call void @llvm.masked.store.nxv8i16(%sv8i16 %data, %sv8i16* %base_store, i32 1, %sv8i1 %mask) + ret void +} + +define void @test_masked_ldst_sv8f16(%sv8f16 * %base, %sv8i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl] +; CHECK: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv8f16, %sv8f16* %base, i64 -1 + %data = call %sv8f16 @llvm.masked.load.nxv8f16(%sv8f16* %base_load, i32 1, %sv8i1 %mask, %sv8f16 undef) + %base_store = getelementptr %sv8f16, %sv8f16 * %base, i64 2 + call void @llvm.masked.store.nxv8f16(%sv8f16 %data, %sv8f16* %base_store, i32 1, %sv8i1 %mask) + ret void +} + +; 8-lane zero/sign extended contiguous loads. + +define %sv8i16 @masked_zload_sv8i8_to_sv8i16(%sv8i8* %base, %sv8i1 %mask) { +; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16: +; CHECK: ld1b { z0.h }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv8i8, %sv8i8* %base, i64 -4 + %load = call %sv8i8 @llvm.masked.load.nxv8i8(%sv8i8* %base_load, i32 1, %sv8i1 %mask, %sv8i8 undef) + %ext = zext %sv8i8 %load to %sv8i16 + ret %sv8i16 %ext +} + +define %sv8i16 @masked_sload_sv8i8_to_sv8i16(%sv8i8* %base, %sv8i1 %mask) { +; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16: +; CHECK: ld1sb { z0.h }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv8i8, %sv8i8* %base, i64 -3 + %load = call %sv8i8 @llvm.masked.load.nxv8i8(%sv8i8* %base_load, i32 1, %sv8i1 %mask, %sv8i8 undef) + %ext = sext %sv8i8 %load to %sv8i16 + ret %sv8i16 %ext +} + +; 8-lane truncating contiguous stores. + +define void @masked_trunc_store_sv8i16_to_sv8i8(%sv8i16 %val, %sv8i8 *%base, %sv8i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8: +; CHECK: st1b { z0.h }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv8i8, %sv8i8* %base, i64 3 + %trunc = trunc %sv8i16 %val to %sv8i8 + call void @llvm.masked.store.nxv8i8(%sv8i8 %trunc, %sv8i8 *%base_load, i32 8, %sv8i1 %mask) + ret void +} + +; 16-lane contiguous load/stores. 
+ +define void @test_masked_ldst_sv16i8(%sv16i8 * %base, %sv16i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv16i8: +; CHECK: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl] +; CHECK: st1b { z[[DATA]].b }, p0, [x0, #7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv16i8, %sv16i8* %base, i64 6 + %data = call %sv16i8 @llvm.masked.load.nxv16i8(%sv16i8* %base_load, i32 1, %sv16i1 %mask, %sv16i8 undef) + %base_store = getelementptr %sv16i8, %sv16i8 * %base, i64 7 + call void @llvm.masked.store.nxv16i8(%sv16i8 %data, %sv16i8* %base_store, i32 1, %sv16i1 %mask) + ret void +}
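+
+; Note on the checks above: for the full-width .b accesses, "#N, mul vl"
+; scales N by the vector length in bytes (vscale x 16), so the final st1b
+; stores to x0 + 7 * (vscale x 16) bytes, which is exactly the byte offset
+; of "getelementptr %sv16i8, %sv16i8* %base, i64 7".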