Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2327,6 +2327,13 @@
       DAG.haveNoCommonBitsSet(N0, N1))
     return DAG.getNode(ISD::OR, DL, VT, N0, N1);
 
+  // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1))
+  if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) {
+    APInt C0 = N0->getConstantOperandAPInt(0);
+    APInt C1 = N1->getConstantOperandAPInt(0);
+    return DAG.getVScale(DL, VT, C0 + C1);
+  }
+
   return SDValue();
 }
 
@@ -3250,6 +3257,12 @@
     }
   }
 
+  // canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C))
+  if (N1.getOpcode() == ISD::VSCALE) {
+    APInt IntVal = N1.getConstantOperandAPInt(0);
+    return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
+  }
+
   // Prefer an add for more folding potential and possibly better codegen:
   // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)
   if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
@@ -3585,6 +3598,14 @@
       DAG.getNode(ISD::MUL, SDLoc(N1), VT, N0.getOperand(1), N1));
 
+  // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1))
+  if (N0.getOpcode() == ISD::VSCALE)
+    if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) {
+      APInt C0 = N0.getConstantOperandAPInt(0);
+      APInt C1 = NC1->getAPIntValue();
+      return DAG.getVScale(SDLoc(N), VT, C0 * C1);
+    }
+
   // reassociate mul
   if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
     return RMUL;
@@ -7706,6 +7727,15 @@
   if (SDValue NewSHL = visitShiftByConstant(N))
     return NewSHL;
 
+  // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1))
+  if (N0.getOpcode() == ISD::VSCALE)
+    if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) {
+      auto DL = SDLoc(N);
+      APInt C0 = N0.getConstantOperandAPInt(0);
+      APInt C1 = NC1->getAPIntValue();
+      return DAG.getVScale(DL, VT, C0 << C1);
+    }
+
   return SDValue();
 }
Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -221,6 +221,14 @@
   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
 
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
+  template <int64_t Min, int64_t Max>
+  bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
+                                SDValue &OffImm);
+  /// SVE Reg+Reg address mode
+  template <unsigned Scale>
+  bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
+    return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
+  }
 
   void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
   void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -280,6 +288,8 @@
   bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
   bool SelectSVEArithImm(SDValue N, SDValue &Imm);
+  bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
+                               SDValue &Offset);
 };
 
 } // end anonymous namespace
 
@@ -4399,3 +4409,84 @@
                                          CodeGenOpt::Level OptLevel) {
   return new AArch64DAGToDAGISel(TM, OptLevel);
 }
+
+/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
+/// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max
+/// where Root is the memory access using N for its address.
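+/// For example (illustrative values only): for a predicated access of
+/// nxv2i64 the known-minimum memory width is 16 bytes, so an address of the
+/// form (add Base, (vscale * 48)) matches with OffImm = 3 (within the
+/// [-8, 7] range used by the contiguous forms), whereas (vscale * 40) is
+/// rejected because 40 is not a multiple of 16.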
+template +bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, + SDValue &Base, + SDValue &OffImm) { + if (!isa(Root)) + return false; + + EVT MemVT = cast(Root)->getMemoryVT(); + + if (N.getOpcode() != ISD::ADD) + return false; + + SDValue VScale = N.getOperand(1); + if (VScale.getOpcode() != ISD::VSCALE) + return false; + + TypeSize TS = MemVT.getSizeInBits(); + unsigned MemWidthBytes = TS.getKnownMinSize() / 8; + int64_t MulImm = cast(VScale.getOperand(0))->getSExtValue(); + + if ((MulImm % MemWidthBytes) == 0) { + signed Offset = MulImm / MemWidthBytes; + if ((Offset >= Min) && (Offset <= Max)) { + Base = N.getOperand(0); + OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64); + LLVM_DEBUG(dbgs() << "Match found:\n"; dbgs() << "ROOT:\n"; Root->dumpr(); + dbgs() << "BASE:\n"; Base.dumpr(); dbgs() << "OFFSET:\n"; + OffImm.dumpr()); + return true; + } + } + + return false; +} + +bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale, + SDValue &Base, + SDValue &Offset) { + const unsigned Opcode = N.getOpcode(); + const SDLoc dl(N); + + if (Opcode != ISD::ADD) + return false; + + // Process an ADD node + const SDValue LHS = N.getOperand(0); + const SDValue RHS = N.getOperand(1); + + // We don't match addition to constants + if (isa(RHS) || isa(LHS)) + return false; + + // 8 bit data don't have the SHL node, so we treat it separately. + if (Scale == 0) { + Base = LHS; + Offset = RHS; + return true; + } + + // Check if the RHS is a shift node with a constant. + if (RHS.getOpcode() == ISD::SHL) { + const SDValue SRHS = RHS.getOperand(1); + if (auto C = dyn_cast(SRHS)) { + const uint64_t Shift = C->getZExtValue(); + if (Shift == Scale) { + Base = LHS; + Offset = RHS.getOperand(0); + LLVM_DEBUG(dbgs() << "Match found, rewriting as:\n" + << "BASE:\n"; + Base.dumpr(); dbgs() << "OFFSET:\n"; Offset.dumpr()); + return true; + } + } + } + + return false; +} Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1203,85 +1203,108 @@ // Add more complex addressing modes here as required multiclass pred_load { - + Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))), + (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>; + } + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))), + (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>; + } def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))), (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>; } // 2-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; // 4-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : 
pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; // 8-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; // 16-element contiguous loads - defm : pred_load; + defm : pred_load; multiclass pred_store { + Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)), + (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>; + } + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>; + } def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)), (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>; } // 2-element contiguous stores - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; // 4-element contiguous stores - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; // 8-element contiguous stores - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; // 16-element contiguous stores - defm : pred_store; + defm : pred_store; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; multiclass unpred_store { def _fi : Pat<(store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)), (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + let AddedComplexity = 2 in { + def _reg_imm : Pat<(store(Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)), + (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } } defm Pat_ST1B : unpred_store; @@ -1295,6 +1318,10 @@ multiclass unpred_load { def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm4s1:$offset))), (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + let AddedComplexity = 2 in { + def _reg_imm : Pat<(Ty (load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))), + (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } } defm Pat_LD1B : unpred_load; Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -6974,3 +6974,11 @@ let Constraints = "$Zdn = $_Zdn"; } + +/// Addressing modes +def am_sve_indexed_s4 :ComplexPattern", [], [SDNPWantRoot]>; + +def SVEAddrModeRegReg8 : ComplexPattern", []>; +def SVEAddrModeRegReg16 : ComplexPattern", []>; +def SVEAddrModeRegReg32 : ComplexPattern", []>; +def SVEAddrModeRegReg64 : ComplexPattern", []>; Index: llvm/test/CodeGen/AArch64/sve-gep.ll 
=================================================================== --- llvm/test/CodeGen/AArch64/sve-gep.ll +++ llvm/test/CodeGen/AArch64/sve-gep.ll @@ -4,8 +4,8 @@ define * @scalar_of_scalable_1(* %base) { ; CHECK-LABEL: scalar_of_scalable_1: ; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: add x0, x0, x8, lsl #2 +; CHECK-NEXT: rdvl x8, #4 +; CHECK-NEXT: add x0, x0, x8 ; CHECK-NEXT: ret %d = getelementptr , * %base, i64 4 ret * %d Index: llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg+imm.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg+imm.ll @@ -0,0 +1,486 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s + +%sv2i1 = type +%sv2i8 = type +%sv2i16 = type +%sv2i32 = type +%sv2i64 = type +%sv2f16 = type +%sv2f32 = type +%sv2f64 = type + +%sv4i1 = type +%sv4i8 = type +%sv4i16 = type +%sv4i32 = type +%sv4f16 = type +%sv4f32 = type + +%sv8i1 = type +%sv8i8 = type +%sv8i16 = type +%sv8f16 = type + +%sv16i1 = type +%sv16i8 = type + +; 2-element contiguous loads. +declare %sv2i8 @llvm.masked.load.nxv2i8 (%sv2i8* , i32, %sv2i1, %sv2i8 ) +declare %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16*, i32, %sv2i1, %sv2i16) +declare %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32*, i32, %sv2i1, %sv2i32) +declare %sv2i64 @llvm.masked.load.nxv2i64(%sv2i64*, i32, %sv2i1, %sv2i64) +declare %sv2f16 @llvm.masked.load.nxv2f16(%sv2f16*, i32, %sv2i1, %sv2f16) +declare %sv2f32 @llvm.masked.load.nxv2f32(%sv2f32*, i32, %sv2i1, %sv2f32) +declare %sv2f64 @llvm.masked.load.nxv2f64(%sv2f64*, i32, %sv2i1, %sv2f64) + +; 4-element contiguous loads. +declare %sv4i8 @llvm.masked.load.nxv4i8 (%sv4i8* , i32, %sv4i1, %sv4i8 ) +declare %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16*, i32, %sv4i1, %sv4i16) +declare %sv4i32 @llvm.masked.load.nxv4i32(%sv4i32*, i32, %sv4i1, %sv4i32) +declare %sv4f16 @llvm.masked.load.nxv4f16(%sv4f16*, i32, %sv4i1, %sv4f16) +declare %sv4f32 @llvm.masked.load.nxv4f32(%sv4f32*, i32, %sv4i1, %sv4f32) + +; 8-element contiguous loads. +declare %sv8i8 @llvm.masked.load.nxv8i8 (%sv8i8* , i32, %sv8i1, %sv8i8 ) +declare %sv8i16 @llvm.masked.load.nxv8i16(%sv8i16*, i32, %sv8i1, %sv8i16) +declare %sv8f16 @llvm.masked.load.nxv8f16(%sv8f16*, i32, %sv8i1, %sv8f16) + +; 16-element contiguous loads. +declare %sv16i8 @llvm.masked.load.nxv16i8(%sv16i8*, i32, %sv16i1, %sv16i8) + +; 2-element contiguous stores. +declare void @llvm.masked.store.nxv2i8 (%sv2i8 , %sv2i8* , i32, %sv2i1) +declare void @llvm.masked.store.nxv2i16(%sv2i16, %sv2i16*, i32, %sv2i1) +declare void @llvm.masked.store.nxv2i32(%sv2i32, %sv2i32*, i32, %sv2i1) +declare void @llvm.masked.store.nxv2i64(%sv2i64, %sv2i64*, i32, %sv2i1) +declare void @llvm.masked.store.nxv2f16(%sv2f16, %sv2f16*, i32, %sv2i1) +declare void @llvm.masked.store.nxv2f32(%sv2f32, %sv2f32*, i32, %sv2i1) +declare void @llvm.masked.store.nxv2f64(%sv2f64, %sv2f64*, i32, %sv2i1) + +; 4-element contiguous stores. +declare void @llvm.masked.store.nxv4i8 (%sv4i8 , %sv4i8* , i32, %sv4i1) +declare void @llvm.masked.store.nxv4i16(%sv4i16, %sv4i16*, i32, %sv4i1) +declare void @llvm.masked.store.nxv4i32(%sv4i32, %sv4i32*, i32, %sv4i1) +declare void @llvm.masked.store.nxv4f16(%sv4f16, %sv4f16*, i32, %sv4i1) +declare void @llvm.masked.store.nxv4f32(%sv4f32, %sv4f32*, i32, %sv4i1) + +; 8-element contiguous stores. 
+declare void @llvm.masked.store.nxv8i8 (%sv8i8 , %sv8i8* , i32, %sv8i1) +declare void @llvm.masked.store.nxv8i16(%sv8i16, %sv8i16*, i32, %sv8i1) +declare void @llvm.masked.store.nxv8f16(%sv8f16, %sv8f16*, i32, %sv8i1) + +; 16-element contiguous stores. +declare void @llvm.masked.store.nxv16i8(%sv16i8, %sv16i8*, i32, %sv16i1) + +; Range checks: for all the instruction tested in this file, the +; immediate must be within the range [-8, 7] (4-bit immediate). Out of +; range values are tested only in one case (following). Valid values +; are tested all through the rest of the file. + +define void @imm_out_of_range(%sv2i64 * %base, %sv2i1 %mask) { +; CHECK-LABEL: imm_out_of_range: +; CHECK: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}] +; CHECK: st1d { z[[DATA]].d }, p0, [x{{[0-9]+}}] +; CHECK: ret + %base_load = getelementptr %sv2i64, %sv2i64* %base, i64 8 + %data = call %sv2i64 @llvm.masked.load.nxv2i64(%sv2i64* %base_load, i32 1, %sv2i1 %mask, %sv2i64 undef) + %base_store = getelementptr %sv2i64, %sv2i64 * %base, i64 -9 + call void @llvm.masked.store.nxv2i64(%sv2i64 %data, %sv2i64* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +; 2-lane contiguous load/stores + +define void @test_masked_ldst_sv2i8(%sv2i8 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2i8: +; CHECK: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1b { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2i8, %sv2i8* %base, i64 -8 + %data = call %sv2i8 @llvm.masked.load.nxv2i8(%sv2i8* %base_load, i32 1, %sv2i1 %mask, %sv2i8 undef) + %base_store = getelementptr %sv2i8, %sv2i8 * %base, i64 -7 + call void @llvm.masked.store.nxv2i8(%sv2i8 %data, %sv2i8* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2i16(%sv2i16 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2i16: +; CHECK: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2i16, %sv2i16* %base, i64 -8 + %data = call %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16* %base_load, i32 1, %sv2i1 %mask, %sv2i16 undef) + %base_store = getelementptr %sv2i16, %sv2i16 * %base, i64 -7 + call void @llvm.masked.store.nxv2i16(%sv2i16 %data, %sv2i16* %base_store, i32 1, %sv2i1 %mask) + ret void +} + + +define void @test_masked_ldst_sv2i32(%sv2i32 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2i32: +; CHECK: ld1sw { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2i32, %sv2i32* %base, i64 -8 + %data = call %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32* %base_load, i32 1, %sv2i1 %mask, %sv2i32 undef) + %base_store = getelementptr %sv2i32, %sv2i32 * %base, i64 -7 + call void @llvm.masked.store.nxv2i32(%sv2i32 %data, %sv2i32* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2i64(%sv2i64 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1d { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2i64, %sv2i64* %base, i64 -8 + %data = call %sv2i64 @llvm.masked.load.nxv2i64(%sv2i64* %base_load, i32 1, %sv2i1 %mask, %sv2i64 undef) + %base_store = getelementptr %sv2i64, %sv2i64 * %base, i64 -7 + call void @llvm.masked.store.nxv2i64(%sv2i64 %data, %sv2i64* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +define void 
@test_masked_ldst_sv2f16(%sv2f16 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2f16: +; CHECK: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2f16, %sv2f16* %base, i64 -8 + %data = call %sv2f16 @llvm.masked.load.nxv2f16(%sv2f16* %base_load, i32 1, %sv2i1 %mask, %sv2f16 undef) + %base_store = getelementptr %sv2f16, %sv2f16 * %base, i64 -7 + call void @llvm.masked.store.nxv2f16(%sv2f16 %data, %sv2f16* %base_store, i32 1, %sv2i1 %mask) + ret void +} + + +define void @test_masked_ldst_sv2f32(%sv2f32 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2f32: +; CHECK: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2f32, %sv2f32* %base, i64 -8 + %data = call %sv2f32 @llvm.masked.load.nxv2f32(%sv2f32* %base_load, i32 1, %sv2i1 %mask, %sv2f32 undef) + %base_store = getelementptr %sv2f32, %sv2f32 * %base, i64 -7 + call void @llvm.masked.store.nxv2f32(%sv2f32 %data, %sv2f32* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2f64(%sv2f64 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl] +; CHECK: st1d { z[[DATA]].d }, p0, [x0, #-5, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2f64, %sv2f64* %base, i64 -6 + %data = call %sv2f64 @llvm.masked.load.nxv2f64(%sv2f64* %base_load, i32 1, %sv2i1 %mask, %sv2f64 undef) + %base_store = getelementptr %sv2f64, %sv2f64 * %base, i64 -5 + call void @llvm.masked.store.nxv2f64(%sv2f64 %data, %sv2f64* %base_store, i32 1, %sv2i1 %mask) + ret void +} + +; 2-lane zero/sign extended contiguous loads. 
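+; Note that the immediates below are expressed in multiples of the in-memory
+; vector type (e.g. nxv2i8 for ld1b/ld1sb), since SelectAddrModeIndexedSVE
+; scales the offset by the memory VT of the access rather than by the
+; extended result type.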
+ +define %sv2i64 @masked_zload_sv2i8_to_sv2i64(%sv2i8* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64: +; CHECK: ld1b { z0.d }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i8, %sv2i8* %base, i64 -4 + %load = call %sv2i8 @llvm.masked.load.nxv2i8(%sv2i8* %base_load, i32 1, %sv2i1 %mask, %sv2i8 undef) + %ext = zext %sv2i8 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_sload_sv2i8_to_sv2i64(%sv2i8* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64: +; CHECK: ld1sb { z0.d }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i8, %sv2i8* %base, i64 -3 + %load = call %sv2i8 @llvm.masked.load.nxv2i8(%sv2i8* %base_load, i32 1, %sv2i1 %mask, %sv2i8 undef) + %ext = sext %sv2i8 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_zload_sv2i16_to_sv2i64(%sv2i16* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64: +; CHECK: ld1h { z0.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i16, %sv2i16* %base, i64 1 + %load = call %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16* %base_load, i32 1, %sv2i1 %mask, %sv2i16 undef) + %ext = zext %sv2i16 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_sload_sv2i16_to_sv2i64(%sv2i16* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64: +; CHECK: ld1sh { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i16, %sv2i16* %base, i64 2 + %load = call %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16* %base_load, i32 1, %sv2i1 %mask, %sv2i16 undef) + %ext = sext %sv2i16 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_zload_sv2i32_to_sv2i64(%sv2i32* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64: +; CHECK: ld1w { z0.d }, p0/z, [x0, #-2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i32, %sv2i32* %base, i64 -2 + %load = call %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32* %base_load, i32 1, %sv2i1 %mask, %sv2i32 undef) + %ext = zext %sv2i32 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_sload_sv2i32_to_sv2i64(%sv2i32* %base, %sv2i1 %mask) { +; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64: +; CHECK: ld1sw { z0.d }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i32, %sv2i32* %base, i64 -1 + %load = call %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32* %base_load, i32 1, %sv2i1 %mask, %sv2i32 undef) + %ext = sext %sv2i32 %load to %sv2i64 + ret %sv2i64 %ext +} + +; 2-lane truncating contiguous stores. 
+ +define void @masked_trunc_store_sv2i64_to_sv2i8(%sv2i64 %val, %sv2i8 *%base, %sv2i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8: +; CHECK: st1b { z0.d }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i8, %sv2i8* %base, i64 3 + %trunc = trunc %sv2i64 %val to %sv2i8 + call void @llvm.masked.store.nxv2i8(%sv2i8 %trunc, %sv2i8 *%base_load, i32 8, %sv2i1 %mask) + ret void +} + + +define void @masked_trunc_store_sv2i64_to_sv2i16(%sv2i64 %val, %sv2i16 *%base, %sv2i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16: +; CHECK: st1h { z0.d }, p0, [x0, #4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i16, %sv2i16* %base, i64 4 + %trunc = trunc %sv2i64 %val to %sv2i16 + call void @llvm.masked.store.nxv2i16(%sv2i16 %trunc, %sv2i16 *%base_load, i32 8, %sv2i1 %mask) + ret void +} + +define void @masked_trunc_store_sv2i64_to_sv2i32(%sv2i64 %val, %sv2i32 *%base, %sv2i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32: +; CHECK: st1w { z0.d }, p0, [x0, #5, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv2i32, %sv2i32* %base, i64 5 + %trunc = trunc %sv2i64 %val to %sv2i32 + call void @llvm.masked.store.nxv2i32(%sv2i32 %trunc, %sv2i32 *%base_load, i32 8, %sv2i1 %mask) + ret void +} + +; 4-lane contiguous load/stores. + +define void @test_masked_ldst_sv4i8(%sv4i8 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4i8: +; CHECK: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK: st1b { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4i8, %sv4i8* %base, i64 -1 + %data = call %sv4i8 @llvm.masked.load.nxv4i8(%sv4i8* %base_load, i32 1, %sv4i1 %mask, %sv4i8 undef) + %base_store = getelementptr %sv4i8, %sv4i8 * %base, i64 2 + call void @llvm.masked.store.nxv4i8(%sv4i8 %data, %sv4i8* %base_store, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4i16(%sv4i16 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4i16: +; CHECK: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4i16, %sv4i16* %base, i64 -1 + %data = call %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16* %base_load, i32 1, %sv4i1 %mask, %sv4i16 undef) + %base_store = getelementptr %sv4i16, %sv4i16 * %base, i64 2 + call void @llvm.masked.store.nxv4i16(%sv4i16 %data, %sv4i16* %base_store, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4i32(%sv4i32 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl] +; CHECK: st1w { z[[DATA]].s }, p0, [x0, #7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4i32, %sv4i32* %base, i64 6 + %data = call %sv4i32 @llvm.masked.load.nxv4i32(%sv4i32* %base_load, i32 1, %sv4i1 %mask, %sv4i32 undef) + %base_store = getelementptr %sv4i32, %sv4i32 * %base, i64 7 + call void @llvm.masked.store.nxv4i32(%sv4i32 %data, %sv4i32* %base_store, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4f16(%sv4f16 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4f16: +; CHECK: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4f16, %sv4f16* %base, i64 -1 + %data = call %sv4f16 @llvm.masked.load.nxv4f16(%sv4f16* %base_load, i32 1, %sv4i1 %mask, %sv4f16 undef) + %base_store = getelementptr %sv4f16, %sv4f16 * %base, i64 2 + call void 
@llvm.masked.store.nxv4f16(%sv4f16 %data, %sv4f16* %base_store, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4f32(%sv4f32 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK: st1w { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4f32, %sv4f32* %base, i64 -1 + %data = call %sv4f32 @llvm.masked.load.nxv4f32(%sv4f32* %base_load, i32 1, %sv4i1 %mask, %sv4f32 undef) + %base_store = getelementptr %sv4f32, %sv4f32 * %base, i64 2 + call void @llvm.masked.store.nxv4f32(%sv4f32 %data, %sv4f32* %base_store, i32 1, %sv4i1 %mask) + ret void +} + +; 4-lane zero/sign extended contiguous loads. + +define %sv4i32 @masked_zload_sv4i8_to_sv4i32(%sv4i8* %base, %sv4i1 %mask) { +; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32: +; CHECK: ld1b { z0.s }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i8, %sv4i8* %base, i64 -4 + %load = call %sv4i8 @llvm.masked.load.nxv4i8(%sv4i8* %base_load, i32 1, %sv4i1 %mask, %sv4i8 undef) + %ext = zext %sv4i8 %load to %sv4i32 + ret %sv4i32 %ext +} + +define %sv4i32 @masked_sload_sv4i8_to_sv4i32(%sv4i8* %base, %sv4i1 %mask) { +; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32: +; CHECK: ld1sb { z0.s }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i8, %sv4i8* %base, i64 -3 + %load = call %sv4i8 @llvm.masked.load.nxv4i8(%sv4i8* %base_load, i32 1, %sv4i1 %mask, %sv4i8 undef) + %ext = sext %sv4i8 %load to %sv4i32 + ret %sv4i32 %ext +} + +define %sv4i32 @masked_zload_sv4i16_to_sv4i32(%sv4i16* %base, %sv4i1 %mask) { +; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32: +; CHECK: ld1h { z0.s }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i16, %sv4i16* %base, i64 1 + %load = call %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16* %base_load, i32 1, %sv4i1 %mask, %sv4i16 undef) + %ext = zext %sv4i16 %load to %sv4i32 + ret %sv4i32 %ext +} + +define %sv4i32 @masked_sload_sv4i16_to_sv4i32(%sv4i16* %base, %sv4i1 %mask) { +; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32: +; CHECK: ld1sh { z0.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i16, %sv4i16* %base, i64 2 + %load = call %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16* %base_load, i32 1, %sv4i1 %mask, %sv4i16 undef) + %ext = sext %sv4i16 %load to %sv4i32 + ret %sv4i32 %ext +} + +; 4-lane truncating contiguous stores. + +define void @masked_trunc_store_sv4i32_to_sv4i8(%sv4i32 %val, %sv4i8 *%base, %sv4i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8: +; CHECK: st1b { z0.s }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i8, %sv4i8* %base, i64 3 + %trunc = trunc %sv4i32 %val to %sv4i8 + call void @llvm.masked.store.nxv4i8(%sv4i8 %trunc, %sv4i8 *%base_load, i32 8, %sv4i1 %mask) + ret void +} + + +define void @masked_trunc_store_sv4i32_to_sv4i16(%sv4i32 %val, %sv4i16 *%base, %sv4i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16: +; CHECK: st1h { z0.s }, p0, [x0, #4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv4i16, %sv4i16* %base, i64 4 + %trunc = trunc %sv4i32 %val to %sv4i16 + call void @llvm.masked.store.nxv4i16(%sv4i16 %trunc, %sv4i16 *%base_load, i32 8, %sv4i1 %mask) + ret void +} + +; 8-lane contiguous load/stores. 
+ +define void @test_masked_ldst_sv8i8(%sv8i8 * %base, %sv8i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv8i8: +; CHECK: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl] +; CHECK: st1b { z[[DATA]].h }, p0, [x0, #7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv8i8, %sv8i8* %base, i64 6 + %data = call %sv8i8 @llvm.masked.load.nxv8i8(%sv8i8* %base_load, i32 1, %sv8i1 %mask, %sv8i8 undef) + %base_store = getelementptr %sv8i8, %sv8i8 * %base, i64 7 + call void @llvm.masked.store.nxv8i8(%sv8i8 %data, %sv8i8* %base_store, i32 1, %sv8i1 %mask) + ret void +} + +define void @test_masked_ldst_sv8i16(%sv8i16 * %base, %sv8i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl] +; CHECK: st1h { z[[DATA]].h }, p0, [x0, #7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv8i16, %sv8i16* %base, i64 6 + %data = call %sv8i16 @llvm.masked.load.nxv8i16(%sv8i16* %base_load, i32 1, %sv8i1 %mask, %sv8i16 undef) + %base_store = getelementptr %sv8i16, %sv8i16 * %base, i64 7 + call void @llvm.masked.store.nxv8i16(%sv8i16 %data, %sv8i16* %base_store, i32 1, %sv8i1 %mask) + ret void +} + +define void @test_masked_ldst_sv8f16(%sv8f16 * %base, %sv8i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl] +; CHECK: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv8f16, %sv8f16* %base, i64 -1 + %data = call %sv8f16 @llvm.masked.load.nxv8f16(%sv8f16* %base_load, i32 1, %sv8i1 %mask, %sv8f16 undef) + %base_store = getelementptr %sv8f16, %sv8f16 * %base, i64 2 + call void @llvm.masked.store.nxv8f16(%sv8f16 %data, %sv8f16* %base_store, i32 1, %sv8i1 %mask) + ret void +} + +; 8-lane zero/sign extended contiguous loads. + +define %sv8i16 @masked_zload_sv8i8_to_sv8i16(%sv8i8* %base, %sv8i1 %mask) { +; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16: +; CHECK: ld1b { z0.h }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv8i8, %sv8i8* %base, i64 -4 + %load = call %sv8i8 @llvm.masked.load.nxv8i8(%sv8i8* %base_load, i32 1, %sv8i1 %mask, %sv8i8 undef) + %ext = zext %sv8i8 %load to %sv8i16 + ret %sv8i16 %ext +} + +define %sv8i16 @masked_sload_sv8i8_to_sv8i16(%sv8i8* %base, %sv8i1 %mask) { +; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16: +; CHECK: ld1sb { z0.h }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv8i8, %sv8i8* %base, i64 -3 + %load = call %sv8i8 @llvm.masked.load.nxv8i8(%sv8i8* %base_load, i32 1, %sv8i1 %mask, %sv8i8 undef) + %ext = sext %sv8i8 %load to %sv8i16 + ret %sv8i16 %ext +} + +; 8-lane truncating contiguous stores. + +define void @masked_trunc_store_sv8i16_to_sv8i8(%sv8i16 %val, %sv8i8 *%base, %sv8i1 %mask) { +; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8: +; CHECK: st1b { z0.h }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr %sv8i8, %sv8i8* %base, i64 3 + %trunc = trunc %sv8i16 %val to %sv8i8 + call void @llvm.masked.store.nxv8i8(%sv8i8 %trunc, %sv8i8 *%base_load, i32 8, %sv8i1 %mask) + ret void +} + +; 16-lane contiguous load/stores. 
+ +define void @test_masked_ldst_sv16i8(%sv16i8 * %base, %sv16i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv16i8: +; CHECK: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl] +; CHECK: st1b { z[[DATA]].b }, p0, [x0, #7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv16i8, %sv16i8* %base, i64 6 + %data = call %sv16i8 @llvm.masked.load.nxv16i8(%sv16i8* %base_load, i32 1, %sv16i1 %mask, %sv16i8 undef) + %base_store = getelementptr %sv16i8, %sv16i8 * %base, i64 7 + call void @llvm.masked.store.nxv16i8(%sv16i8 %data, %sv16i8* %base_store, i32 1, %sv16i1 %mask) + ret void +} Index: llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg+reg.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg+reg.ll @@ -0,0 +1,484 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s + +%sv2i1 = type +%sv2i8 = type +%sv2i16 = type +%sv2i32 = type +%sv2i64 = type +%sv2f16 = type +%sv2f32 = type +%sv2f64 = type + +%sv4i1 = type +%sv4i8 = type +%sv4i16 = type +%sv4i32 = type +%sv4f16 = type +%sv4f32 = type + +%sv8i1 = type +%sv8i8 = type +%sv8i16 = type +%sv8f16 = type + +%sv16i1 = type +%sv16i8 = type + +; 2-element contiguous loads. +declare %sv2i8 @llvm.masked.load.nxv2i8 (%sv2i8* , i32, %sv2i1, %sv2i8 ) +declare %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16*, i32, %sv2i1, %sv2i16) +declare %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32*, i32, %sv2i1, %sv2i32) +declare %sv2i64 @llvm.masked.load.nxv2i64(%sv2i64*, i32, %sv2i1, %sv2i64) +declare %sv2f16 @llvm.masked.load.nxv2f16(%sv2f16*, i32, %sv2i1, %sv2f16) +declare %sv2f32 @llvm.masked.load.nxv2f32(%sv2f32*, i32, %sv2i1, %sv2f32) +declare %sv2f64 @llvm.masked.load.nxv2f64(%sv2f64*, i32, %sv2i1, %sv2f64) + +; 4-element contiguous loads. +declare %sv4i8 @llvm.masked.load.nxv4i8 (%sv4i8* , i32, %sv4i1, %sv4i8 ) +declare %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16*, i32, %sv4i1, %sv4i16) +declare %sv4i32 @llvm.masked.load.nxv4i32(%sv4i32*, i32, %sv4i1, %sv4i32) +declare %sv4f16 @llvm.masked.load.nxv4f16(%sv4f16*, i32, %sv4i1, %sv4f16) +declare %sv4f32 @llvm.masked.load.nxv4f32(%sv4f32*, i32, %sv4i1, %sv4f32) + +; 8-element contiguous loads. +declare %sv8i8 @llvm.masked.load.nxv8i8 (%sv8i8* , i32, %sv8i1, %sv8i8 ) +declare %sv8i16 @llvm.masked.load.nxv8i16(%sv8i16*, i32, %sv8i1, %sv8i16) +declare %sv8f16 @llvm.masked.load.nxv8f16(%sv8f16*, i32, %sv8i1, %sv8f16) + +; 16-element contiguous loads. +declare %sv16i8 @llvm.masked.load.nxv16i8(%sv16i8*, i32, %sv16i1, %sv16i8) + +; 2-element contiguous stores. +declare void @llvm.masked.store.nxv2i8 (%sv2i8 , %sv2i8* , i32, %sv2i1) +declare void @llvm.masked.store.nxv2i16(%sv2i16, %sv2i16*, i32, %sv2i1) +declare void @llvm.masked.store.nxv2i32(%sv2i32, %sv2i32*, i32, %sv2i1) +declare void @llvm.masked.store.nxv2i64(%sv2i64, %sv2i64*, i32, %sv2i1) +declare void @llvm.masked.store.nxv2f16(%sv2f16, %sv2f16*, i32, %sv2i1) +declare void @llvm.masked.store.nxv2f32(%sv2f32, %sv2f32*, i32, %sv2i1) +declare void @llvm.masked.store.nxv2f64(%sv2f64, %sv2f64*, i32, %sv2i1) + +; 4-element contiguous stores. +declare void @llvm.masked.store.nxv4i8 (%sv4i8 , %sv4i8* , i32, %sv4i1) +declare void @llvm.masked.store.nxv4i16(%sv4i16, %sv4i16*, i32, %sv4i1) +declare void @llvm.masked.store.nxv4i32(%sv4i32, %sv4i32*, i32, %sv4i1) +declare void @llvm.masked.store.nxv4f16(%sv4f16, %sv4f16*, i32, %sv4i1) +declare void @llvm.masked.store.nxv4f32(%sv4f32, %sv4f32*, i32, %sv4i1) + +; 8-element contiguous stores. 
+declare void @llvm.masked.store.nxv8i8 (%sv8i8 , %sv8i8* , i32, %sv8i1) +declare void @llvm.masked.store.nxv8i16(%sv8i16, %sv8i16*, i32, %sv8i1) +declare void @llvm.masked.store.nxv8f16(%sv8f16, %sv8f16*, i32, %sv8i1) + +; 16-element contiguous stores. +declare void @llvm.masked.store.nxv16i8(%sv16i8, %sv16i8*, i32, %sv16i1) + +; 2-lane contiguous load/stores + +define void @test_masked_ldst_sv2i8(i8 * %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv2i8: +; CHECK: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1] +; CHECK: st1b { z[[DATA]].d }, p0, [x0, x1] +; CHECK: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to %sv2i8* + %data = call %sv2i8 @llvm.masked.load.nxv2i8(%sv2i8* %base_addr, i32 1, %sv2i1 %mask, %sv2i8 undef) + call void @llvm.masked.store.nxv2i8(%sv2i8 %data, %sv2i8* %base_addr, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2i16(i16 * %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv2i16: +; CHECK: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1] +; CHECK: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1] +; CHECK: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to %sv2i16* + %data = call %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16* %base_addr, i32 1, %sv2i1 %mask, %sv2i16 undef) + call void @llvm.masked.store.nxv2i16(%sv2i16 %data, %sv2i16* %base_addr, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2i32(i32 * %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv2i32: +; CHECK: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK: st1w { z0.d }, p0, [x0, x1, lsl #2] +; CHECK: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to %sv2i32* + %data = call %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32* %base_addr, i32 1, %sv2i1 %mask, %sv2i32 undef) + call void @llvm.masked.store.nxv2i32(%sv2i32 %data, %sv2i32* %base_addr, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2i64(i64 * %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK: ld1d { z0.d }, p0/z, [x0, x1, lsl #3] +; CHECK: st1d { z0.d }, p0, [x0, x1, lsl #3] +; CHECK: ret + %base_i64 = getelementptr i64, i64* %base, i64 %offset + %base_addr = bitcast i64* %base_i64 to %sv2i64* + %data = call %sv2i64 @llvm.masked.load.nxv2i64(%sv2i64* %base_addr, i32 1, %sv2i1 %mask, %sv2i64 undef) + call void @llvm.masked.store.nxv2i64(%sv2i64 %data, %sv2i64* %base_addr, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2f16(half * %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv2f16: +; CHECK: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1] +; CHECK: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1] +; CHECK: ret + %base_half = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_half to %sv2f16* + %data = call %sv2f16 @llvm.masked.load.nxv2f16(%sv2f16* %base_addr, i32 1, %sv2i1 %mask, %sv2f16 undef) + call void @llvm.masked.store.nxv2f16(%sv2f16 %data, %sv2f16* %base_addr, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2f32(float * %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv2f32: +; CHECK: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2] +; CHECK: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2] +; CHECK: ret + %base_float = getelementptr float, float* %base, i64 %offset + %base_addr = bitcast float* %base_float to %sv2f32* + %data = 
call %sv2f32 @llvm.masked.load.nxv2f32(%sv2f32* %base_addr, i32 1, %sv2i1 %mask, %sv2f32 undef) + call void @llvm.masked.store.nxv2f32(%sv2f32 %data, %sv2f32* %base_addr, i32 1, %sv2i1 %mask) + ret void +} + +define void @test_masked_ldst_sv2f64(double * %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3] +; CHECK: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3] +; CHECK: ret + %base_double = getelementptr double, double* %base, i64 %offset + %base_addr = bitcast double* %base_double to %sv2f64* + %data = call %sv2f64 @llvm.masked.load.nxv2f64(%sv2f64* %base_addr, i32 1, %sv2i1 %mask, %sv2f64 undef) + call void @llvm.masked.store.nxv2f64(%sv2f64 %data, %sv2f64* %base_addr, i32 1, %sv2i1 %mask) + ret void +} + +; 2-lane zero/sign extended contiguous loads. + +define %sv2i64 @masked_zload_sv2i8_to_sv2i64(i8* %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64: +; CHECK: ld1b { z0.d }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to %sv2i8* + %load = call %sv2i8 @llvm.masked.load.nxv2i8(%sv2i8* %base_addr, i32 1, %sv2i1 %mask, %sv2i8 undef) + %ext = zext %sv2i8 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_sload_sv2i8_to_sv2i64(i8* %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64: +; CHECK: ld1sb { z0.d }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to %sv2i8* + %load = call %sv2i8 @llvm.masked.load.nxv2i8(%sv2i8* %base_addr, i32 1, %sv2i1 %mask, %sv2i8 undef) + %ext = sext %sv2i8 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_zload_sv2i16_to_sv2i64(i16* %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64: +; CHECK: ld1h { z0.d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to %sv2i16* + %load = call %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16* %base_addr, i32 1, %sv2i1 %mask, %sv2i16 undef) + %ext = zext %sv2i16 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_sload_sv2i16_to_sv2i64(i16* %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64: +; CHECK: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to %sv2i16* + %load = call %sv2i16 @llvm.masked.load.nxv2i16(%sv2i16* %base_addr, i32 1, %sv2i1 %mask, %sv2i16 undef) + %ext = sext %sv2i16 %load to %sv2i64 + ret %sv2i64 %ext +} + + +define %sv2i64 @masked_zload_sv2i32_to_sv2i64(i32* %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64: +; CHECK: ld1w { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to %sv2i32* + %load = call %sv2i32 @llvm.masked.load.nxv2i32(%sv2i32* %base_addr, i32 1, %sv2i1 %mask, %sv2i32 undef) + %ext = zext %sv2i32 %load to %sv2i64 + ret %sv2i64 %ext +} + +define %sv2i64 @masked_sload_sv2i32_to_sv2i64(i32* %base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64: +; CHECK: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to %sv2i32* + %load = call %sv2i32 
@llvm.masked.load.nxv2i32(%sv2i32* %base_addr, i32 1, %sv2i1 %mask, %sv2i32 undef) + %ext = sext %sv2i32 %load to %sv2i64 + ret %sv2i64 %ext +} + +; 2-lane truncating contiguous stores. + +define void @masked_trunc_store_sv2i64_to_sv2i8(%sv2i64 %val, i8 *%base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8: +; CHECK: st1b { z0.d }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to %sv2i8* + %trunc = trunc %sv2i64 %val to %sv2i8 + call void @llvm.masked.store.nxv2i8(%sv2i8 %trunc, %sv2i8 *%base_addr, i32 8, %sv2i1 %mask) + ret void +} + +define void @masked_trunc_store_sv2i64_to_sv2i16(%sv2i64 %val, i16 *%base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16: +; CHECK: st1h { z0.d }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to %sv2i16* + %trunc = trunc %sv2i64 %val to %sv2i16 + call void @llvm.masked.store.nxv2i16(%sv2i16 %trunc, %sv2i16 *%base_addr, i32 8, %sv2i1 %mask) + ret void +} + +define void @masked_trunc_store_sv2i64_to_sv2i32(%sv2i64 %val, i32 *%base, %sv2i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32: +; CHECK: st1w { z0.d }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to %sv2i32* + %trunc = trunc %sv2i64 %val to %sv2i32 + call void @llvm.masked.store.nxv2i32(%sv2i32 %trunc, %sv2i32 *%base_addr, i32 8, %sv2i1 %mask) + ret void +} + +; 4-lane contiguous load/stores. + +define void @test_masked_ldst_sv4i8(i8 * %base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv4i8: +; CHECK: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1] +; CHECK: st1b { z[[DATA]].s }, p0, [x0, x1] +; CHECK: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to %sv4i8* + %data = call %sv4i8 @llvm.masked.load.nxv4i8(%sv4i8* %base_addr, i32 1, %sv4i1 %mask, %sv4i8 undef) + call void @llvm.masked.store.nxv4i8(%sv4i8 %data, %sv4i8* %base_addr, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4i16(i16 * %base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv4i16: +; CHECK: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1] +; CHECK: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1] +; CHECK: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to %sv4i16* + %data = call %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16* %base_addr, i32 1, %sv4i1 %mask, %sv4i16 undef) + call void @llvm.masked.store.nxv4i16(%sv4i16 %data, %sv4i16* %base_addr, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4i32(i32 * %base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2] +; CHECK: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2] +; CHECK: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to %sv4i32* + %data = call %sv4i32 @llvm.masked.load.nxv4i32(%sv4i32* %base_addr, i32 1, %sv4i1 %mask, %sv4i32 undef) + call void @llvm.masked.store.nxv4i32(%sv4i32 %data, %sv4i32* %base_addr, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4f16(half * %base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv4f16: +; CHECK: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1] +; CHECK: st1h { 
z[[DATA]].s }, p0, [x0, x1, lsl #1] +; CHECK: ret + %base_f16 = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_f16 to %sv4f16* + %data = call %sv4f16 @llvm.masked.load.nxv4f16(%sv4f16* %base_addr, i32 1, %sv4i1 %mask, %sv4f16 undef) + call void @llvm.masked.store.nxv4f16(%sv4f16 %data, %sv4f16* %base_addr, i32 1, %sv4i1 %mask) + ret void +} + +define void @test_masked_ldst_sv4f32(float * %base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2] +; CHECK: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2] +; CHECK: ret + %base_f32 = getelementptr float, float* %base, i64 %offset + %base_addr = bitcast float* %base_f32 to %sv4f32* + %data = call %sv4f32 @llvm.masked.load.nxv4f32(%sv4f32* %base_addr, i32 1, %sv4i1 %mask, %sv4f32 undef) + call void @llvm.masked.store.nxv4f32(%sv4f32 %data, %sv4f32* %base_addr, i32 1, %sv4i1 %mask) + ret void +} + +; 4-lane zero/sign extended contiguous loads. + +define %sv4i32 @masked_zload_sv4i8_to_sv4i32(i8* %base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32: +; CHECK: ld1b { z0.s }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to %sv4i8* + %load = call %sv4i8 @llvm.masked.load.nxv4i8(%sv4i8* %base_addr, i32 1, %sv4i1 %mask, %sv4i8 undef) + %ext = zext %sv4i8 %load to %sv4i32 + ret %sv4i32 %ext +} + +define %sv4i32 @masked_sload_sv4i8_to_sv4i32(i8* %base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32: +; CHECK: ld1sb { z0.s }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to %sv4i8* + %load = call %sv4i8 @llvm.masked.load.nxv4i8(%sv4i8* %base_addr, i32 1, %sv4i1 %mask, %sv4i8 undef) + %ext = sext %sv4i8 %load to %sv4i32 + ret %sv4i32 %ext +} + +define %sv4i32 @masked_zload_sv4i16_to_sv4i32(i16* %base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32: +; CHECK: ld1h { z0.s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to %sv4i16* + %load = call %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16* %base_addr, i32 1, %sv4i1 %mask, %sv4i16 undef) + %ext = zext %sv4i16 %load to %sv4i32 + ret %sv4i32 %ext +} + +define %sv4i32 @masked_sload_sv4i16_to_sv4i32(i16* %base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32: +; CHECK: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to %sv4i16* + %load = call %sv4i16 @llvm.masked.load.nxv4i16(%sv4i16* %base_addr, i32 1, %sv4i1 %mask, %sv4i16 undef) + %ext = sext %sv4i16 %load to %sv4i32 + ret %sv4i32 %ext +} + +; 4-lane truncating contiguous stores. 
+ +define void @masked_trunc_store_sv4i32_to_sv4i8(%sv4i32 %val, i8 *%base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8: +; CHECK: st1b { z0.s }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to %sv4i8* + %trunc = trunc %sv4i32 %val to %sv4i8 + call void @llvm.masked.store.nxv4i8(%sv4i8 %trunc, %sv4i8 *%base_addr, i32 8, %sv4i1 %mask) + ret void +} + +define void @masked_trunc_store_sv4i32_to_sv4i16(%sv4i32 %val, i16 *%base, %sv4i1 %mask, i64 %offset) { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16: +; CHECK: st1h { z0.s }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to %sv4i16* + %trunc = trunc %sv4i32 %val to %sv4i16 + call void @llvm.masked.store.nxv4i16(%sv4i16 %trunc, %sv4i16 *%base_addr, i32 8, %sv4i1 %mask) + ret void +} + +; 8-lane contiguous load/stores. + +define void @test_masked_ldst_sv8i8(i8 * %base, %sv8i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv8i8: +; CHECK: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1] +; CHECK: st1b { z[[DATA]].h }, p0, [x0, x1] +; CHECK: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to %sv8i8* + %data = call %sv8i8 @llvm.masked.load.nxv8i8(%sv8i8* %base_addr, i32 1, %sv8i1 %mask, %sv8i8 undef) + call void @llvm.masked.store.nxv8i8(%sv8i8 %data, %sv8i8* %base_addr, i32 1, %sv8i1 %mask) + ret void +} + +define void @test_masked_ldst_sv8i16(i16 * %base, %sv8i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to %sv8i16* + %data = call %sv8i16 @llvm.masked.load.nxv8i16(%sv8i16* %base_addr, i32 1, %sv8i1 %mask, %sv8i16 undef) + call void @llvm.masked.store.nxv8i16(%sv8i16 %data, %sv8i16* %base_addr, i32 1, %sv8i1 %mask) + ret void +} + +define void @test_masked_ldst_sv8f16(half * %base, %sv8i1 %mask, i64 %offset) { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK: ret + %base_f16 = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_f16 to %sv8f16* + %data = call %sv8f16 @llvm.masked.load.nxv8f16(%sv8f16* %base_addr, i32 1, %sv8i1 %mask, %sv8f16 undef) + call void @llvm.masked.store.nxv8f16(%sv8f16 %data, %sv8f16* %base_addr, i32 1, %sv8i1 %mask) + ret void +} + +; 8-lane zero/sign extended contiguous loads. 
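+; Byte-sized accesses (here and above) use the unshifted [base, index] form,
+; i.e. the Scale == 0 case in SelectSVERegRegAddrMode; wider elements use an
+; index shifted left by log2 of the element size (lsl #1/#2/#3).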
+
+define %sv8i16 @masked_zload_sv8i8_to_sv8i16(i8* %base, %sv8i1 %mask, i64 %offset) {
+; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
+; CHECK: ld1b { z0.h }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+  %base_i8 = getelementptr i8, i8* %base, i64 %offset
+  %base_addr = bitcast i8* %base_i8 to %sv8i8*
+  %load = call %sv8i8 @llvm.masked.load.nxv8i8(%sv8i8* %base_addr, i32 1, %sv8i1 %mask, %sv8i8 undef)
+  %ext = zext %sv8i8 %load to %sv8i16
+  ret %sv8i16 %ext
+}
+
+define %sv8i16 @masked_sload_sv8i8_to_sv8i16(i8* %base, %sv8i1 %mask, i64 %offset) {
+; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
+; CHECK: ld1sb { z0.h }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+  %base_i8 = getelementptr i8, i8* %base, i64 %offset
+  %base_addr = bitcast i8* %base_i8 to %sv8i8*
+  %load = call %sv8i8 @llvm.masked.load.nxv8i8(%sv8i8* %base_addr, i32 1, %sv8i1 %mask, %sv8i8 undef)
+  %ext = sext %sv8i8 %load to %sv8i16
+  ret %sv8i16 %ext
+}
+
+; 8-lane truncating contiguous stores.
+
+define void @masked_trunc_store_sv8i16_to_sv8i8(%sv8i16 %val, i8 *%base, %sv8i1 %mask, i64 %offset) {
+; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
+; CHECK: st1b { z0.h }, p0, [x0, x1]
+; CHECK-NEXT: ret
+  %base_i8 = getelementptr i8, i8* %base, i64 %offset
+  %base_addr = bitcast i8* %base_i8 to %sv8i8*
+  %trunc = trunc %sv8i16 %val to %sv8i8
+  call void @llvm.masked.store.nxv8i8(%sv8i8 %trunc, %sv8i8 *%base_addr, i32 8, %sv8i1 %mask)
+  ret void
+}
+
+; 16-lane contiguous load/stores.
+
+define void @test_masked_ldst_sv16i8(i8 * %base, %sv16i1 %mask, i64 %offset) {
+; CHECK-LABEL: test_masked_ldst_sv16i8:
+; CHECK: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
+; CHECK: st1b { z[[DATA]].b }, p0, [x0, x1]
+; CHECK: ret
+  %base_i8 = getelementptr i8, i8* %base, i64 %offset
+  %base_addr = bitcast i8* %base_i8 to %sv16i8*
+  %data = call %sv16i8 @llvm.masked.load.nxv16i8(%sv16i8* %base_addr, i32 1, %sv16i1 %mask, %sv16i8 undef)
+  call void @llvm.masked.store.nxv16i8(%sv16i8 %data, %sv16i8* %base_addr, i32 1, %sv16i1 %mask)
+  ret void
+}
Index: llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg+imm.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg+imm.ll
@@ -0,0 +1,157 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%sv2i1 = type <vscale x 2 x i1>
+%sv2i64 = type <vscale x 2 x i64>
+%sv2f64 = type <vscale x 2 x double>
+
+%sv4i1 = type <vscale x 4 x i1>
+%sv4i32 = type <vscale x 4 x i32>
+%sv4f32 = type <vscale x 4 x float>
+
+%sv8i1 = type <vscale x 8 x i1>
+%sv8i16 = type <vscale x 8 x i16>
+%sv8f16 = type <vscale x 8 x half>
+
+%sv16i1 = type <vscale x 16 x i1>
+%sv16i8 = type <vscale x 16 x i8>
+
+; 2-element non-temporal loads.
+declare %sv2i64 @llvm.aarch64.sve.ldnt1.nxv2i64(%sv2i1, %sv2i64*)
+declare %sv2f64 @llvm.aarch64.sve.ldnt1.nxv2f64(%sv2i1, %sv2f64*)
+
+; 4-element non-temporal loads.
+declare %sv4i32 @llvm.aarch64.sve.ldnt1.nxv4i32(%sv4i1, %sv4i32*)
+declare %sv4f32 @llvm.aarch64.sve.ldnt1.nxv4f32(%sv4i1, %sv4f32*)
+
+; 8-element non-temporal loads.
+declare %sv8i16 @llvm.aarch64.sve.ldnt1.nxv8i16(%sv8i1, %sv8i16*)
+declare %sv8f16 @llvm.aarch64.sve.ldnt1.nxv8f16(%sv8i1, %sv8f16*)
+
+; 16-element non-temporal loads.
+declare %sv16i8 @llvm.aarch64.sve.ldnt1.nxv16i8(%sv16i1, %sv16i8*)
+
+; 2-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv2i64(%sv2i64, %sv2i1, %sv2i64*)
+declare void @llvm.aarch64.sve.stnt1.nxv2f64(%sv2f64, %sv2i1, %sv2f64*)
+
+; 4-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv4i32(%sv4i32, %sv4i1, %sv4i32*)
+declare void @llvm.aarch64.sve.stnt1.nxv4f32(%sv4f32, %sv4i1, %sv4f32*)
+
+; 8-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv8i16(%sv8i16, %sv8i1, %sv8i16*) +declare void @llvm.aarch64.sve.stnt1.nxv8f16(%sv8f16, %sv8i1, %sv8f16*) + +; 16-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv16i8(%sv16i8, %sv16i1, %sv16i8*) + +; Range checks: for all the instruction tested in this file, the +; immediate must be within the range [-8, 7] (4-bit immediate). Out of +; range values are tested only in one case (following). Valid values +; are tested all through the rest of the file. + +define void @imm_out_of_range(%sv2i64 * %base, %sv2i1 %mask) { +; CHECK-LABEL: imm_out_of_range: +; CHECK: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}] +; CHECK: stnt1d { z[[DATA]].d }, p0, [x{{[0-9]+}}] +; CHECK: ret + %base_load = getelementptr %sv2i64, %sv2i64* %base, i64 8 + %data = call %sv2i64 @llvm.aarch64.sve.ldnt1.nxv2i64(%sv2i1 %mask, %sv2i64* %base_load) + %base_store = getelementptr %sv2i64, %sv2i64 * %base, i64 -9 + call void @llvm.aarch64.sve.stnt1.nxv2i64(%sv2i64 %data, %sv2i1 %mask, %sv2i64* %base_store) + ret void +} + +; 2-lane non-temporal load/stores + + +define void @test_masked_ldst_sv2i64(%sv2i64 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK: stnt1d { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2i64, %sv2i64* %base, i64 -8 + %data = call %sv2i64 @llvm.aarch64.sve.ldnt1.nxv2i64(%sv2i1 %mask, %sv2i64* %base_load) + %base_store = getelementptr %sv2i64, %sv2i64 * %base, i64 -7 + call void @llvm.aarch64.sve.stnt1.nxv2i64(%sv2i64 %data, %sv2i1 %mask, %sv2i64* %base_store) + ret void +} + +define void @test_masked_ldst_sv2f64(%sv2f64 * %base, %sv2i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl] +; CHECK: stnt1d { z[[DATA]].d }, p0, [x0, #-5, mul vl] +; CHECK: ret + %base_load = getelementptr %sv2f64, %sv2f64* %base, i64 -6 + %data = call %sv2f64 @llvm.aarch64.sve.ldnt1.nxv2f64(%sv2i1 %mask,%sv2f64* %base_load) + %base_store = getelementptr %sv2f64, %sv2f64 * %base, i64 -5 + call void @llvm.aarch64.sve.stnt1.nxv2f64(%sv2f64 %data, %sv2i1 %mask, %sv2f64* %base_store) + ret void +} + +; 4-lane non-temporal load/stores. + +define void @test_masked_ldst_sv4i32(%sv4i32 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl] +; CHECK: stnt1w { z[[DATA]].s }, p0, [x0, #7, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4i32, %sv4i32* %base, i64 6 + %data = call %sv4i32 @llvm.aarch64.sve.ldnt1.nxv4i32(%sv4i1 %mask, %sv4i32* %base_load) + %base_store = getelementptr %sv4i32, %sv4i32 * %base, i64 7 + call void @llvm.aarch64.sve.stnt1.nxv4i32(%sv4i32 %data, %sv4i1 %mask, %sv4i32* %base_store) + ret void +} + +define void @test_masked_ldst_sv4f32(%sv4f32 * %base, %sv4i1 %mask) { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK: stnt1w { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK: ret + %base_load = getelementptr %sv4f32, %sv4f32* %base, i64 -1 + %data = call %sv4f32 @llvm.aarch64.sve.ldnt1.nxv4f32(%sv4i1 %mask, %sv4f32* %base_load) + %base_store = getelementptr %sv4f32, %sv4f32 * %base, i64 2 + call void @llvm.aarch64.sve.stnt1.nxv4f32(%sv4f32 %data, %sv4i1 %mask, %sv4f32* %base_store) + ret void +} + + +; 8-lane non-temporal load/stores. 
+
+define void @test_masked_ldst_sv8i16(%sv8i16 * %base, %sv8i1 %mask) {
+; CHECK-LABEL: test_masked_ldst_sv8i16:
+; CHECK: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl]
+; CHECK: stnt1h { z[[DATA]].h }, p0, [x0, #7, mul vl]
+; CHECK: ret
+  %base_load = getelementptr %sv8i16, %sv8i16* %base, i64 6
+  %data = call %sv8i16 @llvm.aarch64.sve.ldnt1.nxv8i16(%sv8i1 %mask, %sv8i16* %base_load)
+  %base_store = getelementptr %sv8i16, %sv8i16 * %base, i64 7
+  call void @llvm.aarch64.sve.stnt1.nxv8i16(%sv8i16 %data, %sv8i1 %mask, %sv8i16* %base_store)
+  ret void
+}
+
+define void @test_masked_ldst_sv8f16(%sv8f16 * %base, %sv8i1 %mask) {
+; CHECK-LABEL: test_masked_ldst_sv8f16:
+; CHECK: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
+; CHECK: stnt1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
+; CHECK: ret
+  %base_load = getelementptr %sv8f16, %sv8f16* %base, i64 -1
+  %data = call %sv8f16 @llvm.aarch64.sve.ldnt1.nxv8f16(%sv8i1 %mask, %sv8f16* %base_load)
+  %base_store = getelementptr %sv8f16, %sv8f16 * %base, i64 2
+  call void @llvm.aarch64.sve.stnt1.nxv8f16(%sv8f16 %data, %sv8i1 %mask, %sv8f16* %base_store)
+  ret void
+}
+
+; 16-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv16i8(%sv16i8 * %base, %sv16i1 %mask) {
+; CHECK-LABEL: test_masked_ldst_sv16i8:
+; CHECK: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl]
+; CHECK: stnt1b { z[[DATA]].b }, p0, [x0, #7, mul vl]
+; CHECK: ret
+  %base_load = getelementptr %sv16i8, %sv16i8* %base, i64 6
+  %data = call %sv16i8 @llvm.aarch64.sve.ldnt1.nxv16i8(%sv16i1 %mask, %sv16i8* %base_load)
+  %base_store = getelementptr %sv16i8, %sv16i8 * %base, i64 7
+  call void @llvm.aarch64.sve.stnt1.nxv16i8(%sv16i8 %data, %sv16i1 %mask, %sv16i8* %base_store)
+  ret void
+}
Index: llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg+reg.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg+reg.ll
@@ -0,0 +1,139 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%sv2i1 = type <vscale x 2 x i1>
+%sv2i64 = type <vscale x 2 x i64>
+%sv2f64 = type <vscale x 2 x double>
+
+%sv4i1 = type <vscale x 4 x i1>
+%sv4i32 = type <vscale x 4 x i32>
+%sv4f32 = type <vscale x 4 x float>
+
+%sv8i1 = type <vscale x 8 x i1>
+%sv8i16 = type <vscale x 8 x i16>
+%sv8f16 = type <vscale x 8 x half>
+
+%sv16i1 = type <vscale x 16 x i1>
+%sv16i8 = type <vscale x 16 x i8>
+
+; 2-element non-temporal loads.
+declare %sv2i64 @llvm.aarch64.sve.ldnt1.nxv2i64(%sv2i1, %sv2i64*)
+declare %sv2f64 @llvm.aarch64.sve.ldnt1.nxv2f64(%sv2i1, %sv2f64*)
+
+; 4-element non-temporal loads.
+declare %sv4i32 @llvm.aarch64.sve.ldnt1.nxv4i32(%sv4i1, %sv4i32*)
+declare %sv4f32 @llvm.aarch64.sve.ldnt1.nxv4f32(%sv4i1, %sv4f32*)
+
+; 8-element non-temporal loads.
+declare %sv8i16 @llvm.aarch64.sve.ldnt1.nxv8i16(%sv8i1, %sv8i16*)
+declare %sv8f16 @llvm.aarch64.sve.ldnt1.nxv8f16(%sv8i1, %sv8f16*)
+
+; 16-element non-temporal loads.
+declare %sv16i8 @llvm.aarch64.sve.ldnt1.nxv16i8(%sv16i1, %sv16i8*)
+
+; 2-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv2i64(%sv2i64, %sv2i1, %sv2i64*)
+declare void @llvm.aarch64.sve.stnt1.nxv2f64(%sv2f64, %sv2i1, %sv2f64*)
+
+; 4-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv4i32(%sv4i32, %sv4i1, %sv4i32*)
+declare void @llvm.aarch64.sve.stnt1.nxv4f32(%sv4f32, %sv4i1, %sv4f32*)
+
+; 8-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv8i16(%sv8i16, %sv8i1, %sv8i16*)
+declare void @llvm.aarch64.sve.stnt1.nxv8f16(%sv8f16, %sv8i1, %sv8f16*)
+
+; 16-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv16i8(%sv16i8, %sv16i1, %sv16i8*)
+
+; 2-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv2i64(i64* %base, %sv2i1 %mask, i64 %offset) {
+; CHECK-LABEL: test_masked_ldst_sv2i64:
+; CHECK: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
+; CHECK: stnt1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
+; CHECK: ret
+  %base_i64 = getelementptr i64, i64* %base, i64 %offset
+  %base_addr = bitcast i64* %base_i64 to %sv2i64*
+  %data = call %sv2i64 @llvm.aarch64.sve.ldnt1.nxv2i64(%sv2i1 %mask, %sv2i64* %base_addr)
+  call void @llvm.aarch64.sve.stnt1.nxv2i64(%sv2i64 %data, %sv2i1 %mask, %sv2i64* %base_addr)
+  ret void
+}
+
+define void @test_masked_ldst_sv2f64(double* %base, %sv2i1 %mask, i64 %offset) {
+; CHECK-LABEL: test_masked_ldst_sv2f64:
+; CHECK: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
+; CHECK: stnt1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
+; CHECK: ret
+  %base_double = getelementptr double, double* %base, i64 %offset
+  %base_addr = bitcast double* %base_double to %sv2f64*
+  %data = call %sv2f64 @llvm.aarch64.sve.ldnt1.nxv2f64(%sv2i1 %mask, %sv2f64* %base_addr)
+  call void @llvm.aarch64.sve.stnt1.nxv2f64(%sv2f64 %data, %sv2i1 %mask, %sv2f64* %base_addr)
+  ret void
+}
+
+; 4-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv4i32(i32* %base, %sv4i1 %mask, i64 %offset) {
+; CHECK-LABEL: test_masked_ldst_sv4i32:
+; CHECK: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
+; CHECK: stnt1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
+; CHECK: ret
+  %base_i32 = getelementptr i32, i32* %base, i64 %offset
+  %base_addr = bitcast i32* %base_i32 to %sv4i32*
+  %data = call %sv4i32 @llvm.aarch64.sve.ldnt1.nxv4i32(%sv4i1 %mask, %sv4i32* %base_addr)
+  call void @llvm.aarch64.sve.stnt1.nxv4i32(%sv4i32 %data, %sv4i1 %mask, %sv4i32* %base_addr)
+  ret void
+}
+
+define void @test_masked_ldst_sv4f32(float* %base, %sv4i1 %mask, i64 %offset) {
+; CHECK-LABEL: test_masked_ldst_sv4f32:
+; CHECK: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
+; CHECK: stnt1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
+; CHECK: ret
+  %base_float = getelementptr float, float* %base, i64 %offset
+  %base_addr = bitcast float* %base_float to %sv4f32*
+  %data = call %sv4f32 @llvm.aarch64.sve.ldnt1.nxv4f32(%sv4i1 %mask, %sv4f32* %base_addr)
+  call void @llvm.aarch64.sve.stnt1.nxv4f32(%sv4f32 %data, %sv4i1 %mask, %sv4f32* %base_addr)
+  ret void
+}
+
+; 8-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv8i16(i16* %base, %sv8i1 %mask, i64 %offset) {
+; CHECK-LABEL: test_masked_ldst_sv8i16:
+; CHECK: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
+; CHECK: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
+; CHECK: ret
+  %base_i16 = getelementptr i16, i16* %base, i64 %offset
+  %base_addr = bitcast i16* %base_i16 to %sv8i16*
+  %data = call %sv8i16 @llvm.aarch64.sve.ldnt1.nxv8i16(%sv8i1 %mask, %sv8i16* %base_addr)
+  call void @llvm.aarch64.sve.stnt1.nxv8i16(%sv8i16 %data, %sv8i1 %mask, %sv8i16* %base_addr)
+  ret void
+}
+
+define void @test_masked_ldst_sv8f16(half* %base, %sv8i1 %mask, i64 %offset) {
+; CHECK-LABEL: test_masked_ldst_sv8f16:
+; CHECK: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
+; CHECK: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
+; CHECK: ret
+  %base_half = getelementptr half, half* %base, i64 %offset
+  %base_addr = bitcast half* %base_half to %sv8f16*
+  %data = call %sv8f16 @llvm.aarch64.sve.ldnt1.nxv8f16(%sv8i1 %mask, %sv8f16* %base_addr)
+  call void @llvm.aarch64.sve.stnt1.nxv8f16(%sv8f16 %data, %sv8i1 %mask, %sv8f16* %base_addr)
+  ret void
+}
+
+; 16-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv16i8(i8* %base, %sv16i1 %mask, i64 %offset) {
+; CHECK-LABEL: test_masked_ldst_sv16i8:
+; CHECK: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
+; CHECK: stnt1b { z[[DATA]].b }, p0, [x0, x1]
+; CHECK: ret
+  %base_i8 = getelementptr i8, i8* %base, i64 %offset
+  %base_addr = bitcast i8* %base_i8 to %sv16i8*
+  %data = call %sv16i8 @llvm.aarch64.sve.ldnt1.nxv16i8(%sv16i1 %mask, %sv16i8* %base_addr)
+  call void @llvm.aarch64.sve.stnt1.nxv16i8(%sv16i8 %data, %sv16i1 %mask, %sv16i8* %base_addr)
+  ret void
+}
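
Note for reviewers: as a point of reference, below is a minimal, hypothetical ACLE-level sketch of the kind of source code whose non-temporal accesses lower to the llvm.aarch64.sve.ldnt1/stnt1 calls exercised above. It is not part of the patch or its test files; it assumes a clang with arm_sve.h (SVE ACLE) support, and the function name and compile flags are illustrative only. The intent is that the element offset folds into the scaled register form checked above (e.g. [x0, x1, lsl #3] for 64-bit elements) rather than being computed with a separate add.

  // Illustrative only: ACLE SVE intrinsics from arm_sve.h.
  // Example build (assumed): clang -O2 -target aarch64-linux-gnu -march=armv8-a+sve -c nt_copy.c
  #include <arm_sve.h>
  #include <stdint.h>

  void nt_copy(int64_t *dst, const int64_t *src, svbool_t pg, int64_t off) {
    // base + off, with off counted in 64-bit elements: a candidate for the
    // reg+reg addressing mode with "lsl #3" tested above.
    svint64_t data = svldnt1_s64(pg, src + off);
    svstnt1_s64(pg, dst + off, data);
  }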