diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2327,6 +2327,13 @@
       DAG.haveNoCommonBitsSet(N0, N1))
     return DAG.getNode(ISD::OR, DL, VT, N0, N1);
 
+  // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1))
+  if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) {
+    APInt C0 = N0->getConstantOperandAPInt(0);
+    APInt C1 = N1->getConstantOperandAPInt(0);
+    return DAG.getVScale(DL, VT, C0 + C1);
+  }
+
   return SDValue();
 }
 
@@ -3250,6 +3257,12 @@
     }
   }
 
+  // Canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C))
+  if (N1.getOpcode() == ISD::VSCALE) {
+    APInt IntVal = N1.getConstantOperandAPInt(0);
+    return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
+  }
+
   // Prefer an add for more folding potential and possibly better codegen:
   // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)
   if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
@@ -3585,6 +3598,14 @@
                        DAG.getNode(ISD::MUL, SDLoc(N1), VT,
                                    N0.getOperand(1), N1));
 
+  // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1))
+  if (N0.getOpcode() == ISD::VSCALE)
+    if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) {
+      APInt C0 = N0.getConstantOperandAPInt(0);
+      APInt C1 = NC1->getAPIntValue();
+      return DAG.getVScale(SDLoc(N), VT, C0 * C1);
+    }
+
   // reassociate mul
   if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
     return RMUL;
@@ -7706,6 +7727,15 @@
   if (SDValue NewSHL = visitShiftByConstant(N))
     return NewSHL;
 
+  // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1))
+  if (N0.getOpcode() == ISD::VSCALE)
+    if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) {
+      auto DL = SDLoc(N);
+      APInt C0 = N0.getConstantOperandAPInt(0);
+      APInt C1 = NC1->getAPIntValue();
+      return DAG.getVScale(DL, VT, C0 << C1);
+    }
+
   return SDValue();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -221,6 +221,14 @@
   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
 
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
+  template <int64_t Min, int64_t Max>
+  bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
+                                SDValue &OffImm);
+  /// SVE Reg+Reg address mode
+  template <unsigned Scale>
+  bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
+    return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
+  }
 
   void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
   void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -280,6 +288,8 @@
 
   bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
   bool SelectSVEArithImm(SDValue N, SDValue &Imm);
+  bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
+                               SDValue &Offset);
 };
 } // end anonymous namespace
 
@@ -687,6 +697,7 @@
 // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
 template<signed Low, signed High, signed Scale>
 bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) {
+  N.dump();
   if (!isa<ConstantSDNode>(N))
     return false;
 
@@ -4399,3 +4410,73 @@
                                          CodeGenOpt::Level OptLevel) {
   return new AArch64DAGToDAGISel(TM, OptLevel);
 }
+
+/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
+/// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max
+/// where Root is the memory access using N for its address.
+template +bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, + SDValue &Base, + SDValue &OffImm) { + assert(isa(Root) && "Invalid node."); + + EVT MemVT = cast(Root)->getMemoryVT(); + + if (N.getOpcode() != ISD::ADD) + return false; + + SDValue VScale = N.getOperand(1); + if (VScale.getOpcode() != ISD::VSCALE) + return false; + + TypeSize TS = MemVT.getSizeInBits(); + unsigned MemWidthBytes = TS.getKnownMinSize() / 8; + int64_t MulImm = cast(VScale.getOperand(0))->getSExtValue(); + + if ((MulImm % MemWidthBytes) == 0) { + signed Offset = MulImm / MemWidthBytes; + if ((Offset >= Min) && (Offset <= Max)) { + Base = N.getOperand(0); + OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64); + return true; + } + } + + return false; +} + +/// Select register plus register addressing mode for SVE, with scaled +/// offset. +bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale, + SDValue &Base, + SDValue &Offset) { + if (N.getOpcode() != ISD::ADD) + return false; + + // Process an ADD node. + const SDValue LHS = N.getOperand(0); + const SDValue RHS = N.getOperand(1); + + // 8 bit data does not come with the SHL node, so it is treated + // separately. + if (Scale == 0) { + Base = LHS; + Offset = RHS; + return true; + } + + // Check if the RHS is a shift node with a constant. + if (RHS.getOpcode() == ISD::SHL) { + const SDValue SRHS = RHS.getOperand(1); + if (auto C = dyn_cast(SRHS)) { + const uint64_t Shift = C->getZExtValue(); + if (Shift == Scale) { + Base = LHS; + Offset = RHS.getOperand(0); + return true; + } + } + } + + return false; +} diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1206,81 +1206,100 @@ // Add more complex addressing modes here as required multiclass pred_load { - + Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))), + (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>; + } + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))), + (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>; + } def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))), (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>; } // 2-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; // 4-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; // 8-element contiguous loads - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; // 16-element contiguous loads - defm : pred_load; + 
defm : pred_load; multiclass pred_store { + Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> { + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)), + (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>; + } + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>; + } def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)), (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>; } // 2-element contiguous stores - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; // 4-element contiguous stores - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; // 8-element contiguous stores - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; // 16-element contiguous stores - defm : pred_store; + defm : pred_store; - defm : pred_load; - defm : pred_load; - defm : pred_load; - defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; - defm : pred_store; - defm : pred_store; - defm : pred_store; - defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; multiclass unpred_store { def _fi : Pat<(store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)), diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -6987,3 +6987,11 @@ let Constraints = "$Zdn = $_Zdn"; } + +/// Addressing modes +def am_sve_indexed_s4 :ComplexPattern", [], [SDNPWantRoot]>; + +def am_sve_regreg_lsl0 : ComplexPattern", []>; +def am_sve_regreg_lsl1 : ComplexPattern", []>; +def am_sve_regreg_lsl2 : ComplexPattern", []>; +def am_sve_regreg_lsl3 : ComplexPattern", []>; diff --git a/llvm/test/CodeGen/AArch64/sve-gep.ll b/llvm/test/CodeGen/AArch64/sve-gep.ll --- a/llvm/test/CodeGen/AArch64/sve-gep.ll +++ b/llvm/test/CodeGen/AArch64/sve-gep.ll @@ -4,8 +4,8 @@ define * @scalar_of_scalable_1(* %base) { ; CHECK-LABEL: scalar_of_scalable_1: ; CHECK: // %bb.0: -; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: add x0, x0, x8, lsl #2 +; CHECK-NEXT: rdvl x8, #4 +; CHECK-NEXT: add x0, x0, x8 ; CHECK-NEXT: ret %d = getelementptr , * %base, i64 4 ret * %d diff --git a/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll @@ -0,0 +1,466 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s + +; Range checks: for all the instruction tested in this file, the +; immediate must be within the range [-8, 7] (4-bit immediate). Out of +; range values are tested only in one case (following). 
Valid values +; are tested all through the rest of the file. + +define void @imm_out_of_range( * %base, %mask) nounwind { +; CHECK-LABEL: imm_out_of_range: +; CHECK-NEXT: rdvl x8, #8 +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}] +; CHECK-NEXT: rdvl x8, #-9 +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x{{[0-9]+}}] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 8 + %data = call @llvm.masked.load.nxv2i64(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 -9 + call void @llvm.masked.store.nxv2i64( %data, * %base_store, i32 1, %mask) + ret void +} + +; 2-lane contiguous load/stores + +define void @test_masked_ldst_sv2i8( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2i8(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2i8( %data, * %base_store, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv2i16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i16: +; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2i16(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2i16( %data, * %base_store, i32 1, %mask) + ret void +} + + +define void @test_masked_ldst_sv2i32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i32: +; CHECK-NEXT: ld1sw { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2i32(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2i32( %data, * %base_store, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv2i64( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2i64(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2i64( %data, * %base_store, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv2f16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2f16(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2f16( %data, * %base_store, i32 1, %mask) + ret void +} + + +define void @test_masked_ldst_sv2f32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load 
= getelementptr , * %base, i64 -8 + %data = call @llvm.masked.load.nxv2f32(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.masked.store.nxv2f32( %data, * %base_store, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv2f64( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl] +; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, #-5, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -6 + %data = call @llvm.masked.load.nxv2f64(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 -5 + call void @llvm.masked.store.nxv2f64( %data, * %base_store, i32 1, %mask) + ret void +} + +; 2-lane zero/sign extended contiguous loads. + +define @masked_zload_sv2i8_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64: +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -4 + %load = call @llvm.masked.load.nxv2i8(* %base_load, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i8_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64: +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -3 + %load = call @llvm.masked.load.nxv2i8(* %base_load, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +define @masked_zload_sv2i16_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64: +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 1 + %load = call @llvm.masked.load.nxv2i16(* %base_load, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i16_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64: +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 2 + %load = call @llvm.masked.load.nxv2i16(* %base_load, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +define @masked_zload_sv2i32_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64: +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, #-2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -2 + %load = call @llvm.masked.load.nxv2i32(* %base_load, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i32_to_sv2i64(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64: +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %load = call @llvm.masked.load.nxv2i32(* %base_load, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +; 2-lane truncating contiguous stores. 
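+; In the "#imm, mul vl" forms, the immediate counts multiples of one
+; register's worth of memory data (the known-minimum memory size of the
+; access), not of the in-register element size: an unpacked 2-lane i8 access
+; steps by 2 x vscale bytes per immediate increment, while a 2-lane i64
+; access steps by 16 x vscale bytes.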
+ +define void @masked_trunc_store_sv2i64_to_sv2i8( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8: +; CHECK-NEXT: st1b { z0.d }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 3 + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i8( %trunc, *%base_load, i32 8, %mask) + ret void +} + + +define void @masked_trunc_store_sv2i64_to_sv2i16( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, #4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 4 + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i16( %trunc, *%base_load, i32 8, %mask) + ret void +} + +define void @masked_trunc_store_sv2i64_to_sv2i32( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, #5, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 5 + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i32( %trunc, *%base_load, i32 8, %mask) + ret void +} + +; 4-lane contiguous load/stores. + +define void @test_masked_ldst_sv4i8( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.masked.load.nxv4i8(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.masked.store.nxv4i8( %data, * %base_store, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv4i16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i16: +; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.masked.load.nxv4i16(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.masked.store.nxv4i16( %data, * %base_store, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv4i32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.masked.load.nxv4i32(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.masked.store.nxv4i32( %data, * %base_store, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv4f16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.masked.load.nxv4f16(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.masked.store.nxv4f16( %data, * %base_store, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv4f32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.masked.load.nxv4f32(* %base_load, i32 1, 
%mask, undef) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.masked.store.nxv4f32( %data, * %base_store, i32 1, %mask) + ret void +} + +; 4-lane zero/sign extended contiguous loads. + +define @masked_zload_sv4i8_to_sv4i32(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32: +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -4 + %load = call @llvm.masked.load.nxv4i8(* %base_load, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv4i8_to_sv4i32(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32: +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -3 + %load = call @llvm.masked.load.nxv4i8(* %base_load, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +define @masked_zload_sv4i16_to_sv4i32(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32: +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 1 + %load = call @llvm.masked.load.nxv4i16(* %base_load, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv4i16_to_sv4i32(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32: +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 2 + %load = call @llvm.masked.load.nxv4i16(* %base_load, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +; 4-lane truncating contiguous stores. + +define void @masked_trunc_store_sv4i32_to_sv4i8( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8: +; CHECK-NEXT: st1b { z0.s }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 3 + %trunc = trunc %val to + call void @llvm.masked.store.nxv4i8( %trunc, *%base_load, i32 8, %mask) + ret void +} + + +define void @masked_trunc_store_sv4i32_to_sv4i16( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, #4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 4 + %trunc = trunc %val to + call void @llvm.masked.store.nxv4i16( %trunc, *%base_load, i32 8, %mask) + ret void +} + +; 8-lane contiguous load/stores. 
+ +define void @test_masked_ldst_sv8i8( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.masked.load.nxv8i8(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.masked.store.nxv8i8( %data, * %base_store, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv8i16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.masked.load.nxv8i16(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.masked.store.nxv8i16( %data, * %base_store, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv8f16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.masked.load.nxv8f16(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.masked.store.nxv8f16( %data, * %base_store, i32 1, %mask) + ret void +} + +; 8-lane zero/sign extended contiguous loads. + +define @masked_zload_sv8i8_to_sv8i16(* %base, %mask) nounwind { +; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16: +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, #-4, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -4 + %load = call @llvm.masked.load.nxv8i8(* %base_load, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv8i8_to_sv8i16(* %base, %mask) nounwind { +; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16: +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, #-3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -3 + %load = call @llvm.masked.load.nxv8i8(* %base_load, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +; 8-lane truncating contiguous stores. + +define void @masked_trunc_store_sv8i16_to_sv8i8( %val, *%base, %mask) nounwind { +; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8: +; CHECK-NEXT: st1b { z0.h }, p0, [x0, #3, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 3 + %trunc = trunc %val to + call void @llvm.masked.store.nxv8i8( %trunc, *%base_load, i32 8, %mask) + ret void +} + +; 16-lane contiguous load/stores. + +define void @test_masked_ldst_sv16i8( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv16i8: +; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.masked.load.nxv16i8(* %base_load, i32 1, %mask, undef) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.masked.store.nxv16i8( %data, * %base_store, i32 1, %mask) + ret void +} + +; 2-element contiguous loads. 
+declare @llvm.masked.load.nxv2i8 (* , i32, , ) +declare @llvm.masked.load.nxv2i16(*, i32, , ) +declare @llvm.masked.load.nxv2i32(*, i32, , ) +declare @llvm.masked.load.nxv2i64(*, i32, , ) +declare @llvm.masked.load.nxv2f16(*, i32, , ) +declare @llvm.masked.load.nxv2f32(*, i32, , ) +declare @llvm.masked.load.nxv2f64(*, i32, , ) + +; 4-element contiguous loads. +declare @llvm.masked.load.nxv4i8 (* , i32, , ) +declare @llvm.masked.load.nxv4i16(*, i32, , ) +declare @llvm.masked.load.nxv4i32(*, i32, , ) +declare @llvm.masked.load.nxv4f16(*, i32, , ) +declare @llvm.masked.load.nxv4f32(*, i32, , ) + +; 8-element contiguous loads. +declare @llvm.masked.load.nxv8i8 (* , i32, , ) +declare @llvm.masked.load.nxv8i16(*, i32, , ) +declare @llvm.masked.load.nxv8f16(*, i32, , ) + +; 16-element contiguous loads. +declare @llvm.masked.load.nxv16i8(*, i32, , ) + +; 2-element contiguous stores. +declare void @llvm.masked.store.nxv2i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv2i16(, *, i32, ) +declare void @llvm.masked.store.nxv2i32(, *, i32, ) +declare void @llvm.masked.store.nxv2i64(, *, i32, ) +declare void @llvm.masked.store.nxv2f16(, *, i32, ) +declare void @llvm.masked.store.nxv2f32(, *, i32, ) +declare void @llvm.masked.store.nxv2f64(, *, i32, ) + +; 4-element contiguous stores. +declare void @llvm.masked.store.nxv4i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv4i16(, *, i32, ) +declare void @llvm.masked.store.nxv4i32(, *, i32, ) +declare void @llvm.masked.store.nxv4f16(, *, i32, ) +declare void @llvm.masked.store.nxv4f32(, *, i32, ) + +; 8-element contiguous stores. +declare void @llvm.masked.store.nxv8i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv8i16(, *, i32, ) +declare void @llvm.masked.store.nxv8f16(, *, i32, ) + +; 16-element contiguous stores. 
+declare void @llvm.masked.store.nxv16i8(, *, i32, ) diff --git a/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll @@ -0,0 +1,460 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s + +; 2-lane contiguous load/stores + +define void @test_masked_ldst_sv2i8(i8 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1] +; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %data = call @llvm.masked.load.nxv2i8(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv2i8( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv2i16(i16 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i16: +; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %data = call @llvm.masked.load.nxv2i16(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv2i16( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv2i32(i32 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i32: +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %data = call @llvm.masked.load.nxv2i32(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv2i32( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv2i64(i64 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %base_i64 = getelementptr i64, i64* %base, i64 %offset + %base_addr = bitcast i64* %base_i64 to * + %data = call @llvm.masked.load.nxv2i64(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv2i64( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv2f16(half * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_half = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_half to * + %data = call @llvm.masked.load.nxv2f16(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv2f16( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv2f32(float * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_float = getelementptr float, float* %base, i64 %offset + %base_addr = bitcast float* %base_float to * + %data = call @llvm.masked.load.nxv2f32(* %base_addr, i32 1, %mask, 
undef) + call void @llvm.masked.store.nxv2f32( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv2f64(double * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %base_double = getelementptr double, double* %base, i64 %offset + %base_addr = bitcast double* %base_double to * + %data = call @llvm.masked.load.nxv2f64(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv2f64( %data, * %base_addr, i32 1, %mask) + ret void +} + +; 2-lane zero/sign extended contiguous loads. + +define @masked_zload_sv2i8_to_sv2i64(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64: +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv2i8(* %base_addr, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i8_to_sv2i64(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64: +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv2i8(* %base_addr, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +define @masked_zload_sv2i16_to_sv2i64(i16* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64: +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %load = call @llvm.masked.load.nxv2i16(* %base_addr, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i16_to_sv2i64(i16* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64: +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %load = call @llvm.masked.load.nxv2i16(* %base_addr, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + + +define @masked_zload_sv2i32_to_sv2i64(i32* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64: +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %load = call @llvm.masked.load.nxv2i32(* %base_addr, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv2i32_to_sv2i64(i32* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64: +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %load = call @llvm.masked.load.nxv2i32(* %base_addr, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +; 2-lane truncating contiguous stores. 
+ +define void @masked_trunc_store_sv2i64_to_sv2i8( %val, i8 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8: +; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i8( %trunc, *%base_addr, i32 8, %mask) + ret void +} + +define void @masked_trunc_store_sv2i64_to_sv2i16( %val, i16 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i16( %trunc, *%base_addr, i32 8, %mask) + ret void +} + +define void @masked_trunc_store_sv2i64_to_sv2i32( %val, i32 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv2i32( %trunc, *%base_addr, i32 8, %mask) + ret void +} + +; 4-lane contiguous load/stores. + +define void @test_masked_ldst_sv4i8(i8 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1] +; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %data = call @llvm.masked.load.nxv4i8(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv4i8( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv4i16(i16 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i16: +; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %data = call @llvm.masked.load.nxv4i16(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv4i16( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv4i32(i32 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %data = call @llvm.masked.load.nxv4i32(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv4i32( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv4f16(half * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_f16 = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_f16 to * + %data = call @llvm.masked.load.nxv4f16(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv4f16( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv4f32(float * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: 
test_masked_ldst_sv4f32: +; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_f32 = getelementptr float, float* %base, i64 %offset + %base_addr = bitcast float* %base_f32 to * + %data = call @llvm.masked.load.nxv4f32(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv4f32( %data, * %base_addr, i32 1, %mask) + ret void +} + +; 4-lane zero/sign extended contiguous loads. + +define @masked_zload_sv4i8_to_sv4i32(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32: +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv4i8(* %base_addr, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv4i8_to_sv4i32(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32: +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv4i8(* %base_addr, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +define @masked_zload_sv4i16_to_sv4i32(i16* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32: +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %load = call @llvm.masked.load.nxv4i16(* %base_addr, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv4i16_to_sv4i32(i16* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32: +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %load = call @llvm.masked.load.nxv4i16(* %base_addr, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +; 4-lane truncating contiguous stores. + +define void @masked_trunc_store_sv4i32_to_sv4i8( %val, i8 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8: +; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv4i8( %trunc, *%base_addr, i32 8, %mask) + ret void +} + +define void @masked_trunc_store_sv4i32_to_sv4i16( %val, i16 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv4i16( %trunc, *%base_addr, i32 8, %mask) + ret void +} + +; 8-lane contiguous load/stores. 
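+; In the reg+reg forms the LSL amount is taken from the memory element size
+; (the am_sve_regreg_lsl0..am_sve_regreg_lsl3 patterns), not from the lane
+; count: the 8-lane i8 accesses below use [x0, x1] with no shift, while the
+; 8-lane i16/f16 accesses use [x0, x1, lsl #1].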
+ +define void @test_masked_ldst_sv8i8(i8 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i8: +; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1] +; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %data = call @llvm.masked.load.nxv8i8(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv8i8( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv8i16(i16 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %data = call @llvm.masked.load.nxv8i16(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv8i16( %data, * %base_addr, i32 1, %mask) + ret void +} + +define void @test_masked_ldst_sv8f16(half * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_f16 = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_f16 to * + %data = call @llvm.masked.load.nxv8f16(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv8f16( %data, * %base_addr, i32 1, %mask) + ret void +} + +; 8-lane zero/sign extended contiguous loads. + +define @masked_zload_sv8i8_to_sv8i16(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16: +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv8i8(* %base_addr, i32 1, %mask, undef) + %ext = zext %load to + ret %ext +} + +define @masked_sload_sv8i8_to_sv8i16(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16: +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %load = call @llvm.masked.load.nxv8i8(* %base_addr, i32 1, %mask, undef) + %ext = sext %load to + ret %ext +} + +; 8-lane truncating contiguous stores. + +define void @masked_trunc_store_sv8i16_to_sv8i8( %val, i8 *%base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8: +; C HECK: st1b { z0.h }, p0, [x0, x1] +; C HECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %trunc = trunc %val to + call void @llvm.masked.store.nxv8i8( %trunc, *%base_addr, i32 8, %mask) + ret void +} + +; 16-lane contiguous load/stores. + +define void @test_masked_ldst_sv16i8(i8 * %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv16i8: +; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1] +; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %data = call @llvm.masked.load.nxv16i8(* %base_addr, i32 1, %mask, undef) + call void @llvm.masked.store.nxv16i8( %data, * %base_addr, i32 1, %mask) + ret void +} + +; 2-element contiguous loads. 
+declare @llvm.masked.load.nxv2i8 (* , i32, , ) +declare @llvm.masked.load.nxv2i16(*, i32, , ) +declare @llvm.masked.load.nxv2i32(*, i32, , ) +declare @llvm.masked.load.nxv2i64(*, i32, , ) +declare @llvm.masked.load.nxv2f16(*, i32, , ) +declare @llvm.masked.load.nxv2f32(*, i32, , ) +declare @llvm.masked.load.nxv2f64(*, i32, , ) + +; 4-element contiguous loads. +declare @llvm.masked.load.nxv4i8 (* , i32, , ) +declare @llvm.masked.load.nxv4i16(*, i32, , ) +declare @llvm.masked.load.nxv4i32(*, i32, , ) +declare @llvm.masked.load.nxv4f16(*, i32, , ) +declare @llvm.masked.load.nxv4f32(*, i32, , ) + +; 8-element contiguous loads. +declare @llvm.masked.load.nxv8i8 (* , i32, , ) +declare @llvm.masked.load.nxv8i16(*, i32, , ) +declare @llvm.masked.load.nxv8f16(*, i32, , ) + +; 16-element contiguous loads. +declare @llvm.masked.load.nxv16i8(*, i32, , ) + +; 2-element contiguous stores. +declare void @llvm.masked.store.nxv2i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv2i16(, *, i32, ) +declare void @llvm.masked.store.nxv2i32(, *, i32, ) +declare void @llvm.masked.store.nxv2i64(, *, i32, ) +declare void @llvm.masked.store.nxv2f16(, *, i32, ) +declare void @llvm.masked.store.nxv2f32(, *, i32, ) +declare void @llvm.masked.store.nxv2f64(, *, i32, ) + +; 4-element contiguous stores. +declare void @llvm.masked.store.nxv4i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv4i16(, *, i32, ) +declare void @llvm.masked.store.nxv4i32(, *, i32, ) +declare void @llvm.masked.store.nxv4f16(, *, i32, ) +declare void @llvm.masked.store.nxv4f32(, *, i32, ) + +; 8-element contiguous stores. +declare void @llvm.masked.store.nxv8i8 ( , * , i32, ) +declare void @llvm.masked.store.nxv8i16(, *, i32, ) +declare void @llvm.masked.store.nxv8f16(, *, i32, ) + +; 16-element contiguous stores. +declare void @llvm.masked.store.nxv16i8(, *, i32, ) diff --git a/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll @@ -0,0 +1,147 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s + +; Range checks: for all the instruction tested in this file, the +; immediate must be within the range [-8, 7] (4-bit immediate). Out of +; range values are tested only in one case (following). Valid values +; are tested all through the rest of the file. 
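+; When the index falls outside [-8, 7] the reg+imm form cannot be used, so
+; the offset is expected to be materialised separately with rdvl and added to
+; the base, as the first test below shows.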
+ +define void @imm_out_of_range( * %base, %mask) nounwind { +; CHECK-LABEL: imm_out_of_range: +; CHECK-NEXT: rdvl x8, #8 +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}] +; CHECK-NEXT: rdvl x8, #-9 +; CHECK-NEXT: add x8, x0, x8 +; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x{{[0-9]+}}] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 8 + %data = call @llvm.aarch64.sve.ldnt1.nxv2i64( %mask, * %base_load) + %base_store = getelementptr , * %base, i64 -9 + call void @llvm.aarch64.sve.stnt1.nxv2i64( %data, %mask, * %base_store) + ret void +} + +; 2-lane non-temporal load/stores + + +define void @test_masked_ldst_sv2i64( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl] +; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, #-7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -8 + %data = call @llvm.aarch64.sve.ldnt1.nxv2i64( %mask, * %base_load) + %base_store = getelementptr , * %base, i64 -7 + call void @llvm.aarch64.sve.stnt1.nxv2i64( %data, %mask, * %base_store) + ret void +} + +define void @test_masked_ldst_sv2f64( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl] +; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, #-5, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -6 + %data = call @llvm.aarch64.sve.ldnt1.nxv2f64( %mask,* %base_load) + %base_store = getelementptr , * %base, i64 -5 + call void @llvm.aarch64.sve.stnt1.nxv2f64( %data, %mask, * %base_store) + ret void +} + +; 4-lane non-temporal load/stores. + +define void @test_masked_ldst_sv4i32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.aarch64.sve.ldnt1.nxv4i32( %mask, * %base_load) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.aarch64.sve.stnt1.nxv4i32( %data, %mask, * %base_store) + ret void +} + +define void @test_masked_ldst_sv4f32( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.aarch64.sve.ldnt1.nxv4f32( %mask, * %base_load) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.aarch64.sve.stnt1.nxv4f32( %data, %mask, * %base_store) + ret void +} + + +; 8-lane non-temporal load/stores. 
+ +define void @test_masked_ldst_sv8i16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.aarch64.sve.ldnt1.nxv8i16( %mask, * %base_load) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.aarch64.sve.stnt1.nxv8i16( %data, %mask, * %base_store) + ret void +} + +define void @test_masked_ldst_sv8f16( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl] +; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #2, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 -1 + %data = call @llvm.aarch64.sve.ldnt1.nxv8f16( %mask, * %base_load) + %base_store = getelementptr , * %base, i64 2 + call void @llvm.aarch64.sve.stnt1.nxv8f16( %data, %mask, * %base_store) + ret void +} + +; 16-lane non-temporal load/stores. + +define void @test_masked_ldst_sv16i8( * %base, %mask) nounwind { +; CHECK-LABEL: test_masked_ldst_sv16i8: +; CHECK-NEXT: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: stnt1b { z[[DATA]].b }, p0, [x0, #7, mul vl] +; CHECK-NEXT: ret + %base_load = getelementptr , * %base, i64 6 + %data = call @llvm.aarch64.sve.ldnt1.nxv16i8( %mask, * %base_load) + %base_store = getelementptr , * %base, i64 7 + call void @llvm.aarch64.sve.stnt1.nxv16i8( %data, %mask, * %base_store) + ret void +} + +; 2-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv2i64(, *) +declare @llvm.aarch64.sve.ldnt1.nxv2f64(, *) + +; 4-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv4i32(, *) +declare @llvm.aarch64.sve.ldnt1.nxv4f32(, *) + +; 8-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv8i16(, *) +declare @llvm.aarch64.sve.ldnt1.nxv8f16(, *) + +; 16-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv16i8(, *) + +; 2-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv2i64(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv2f64(, , *) + +; 4-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv4i32(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv4f32(, , *) + +; 8-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv8i16(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv8f16(, , *) + +; 16-element non-temporal stores. 
+declare void @llvm.aarch64.sve.stnt1.nxv16i8(, , *) + diff --git a/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll @@ -0,0 +1,124 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s + +; 2-lane non-temporal load/stores + +define void @test_masked_ldst_sv2i64(i64* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2i64: +; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %base_i64 = getelementptr i64, i64* %base, i64 %offset + %base_addr = bitcast i64* %base_i64 to * + %data = call @llvm.aarch64.sve.ldnt1.nxv2i64( %mask, * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv2i64( %data, %mask, * %base_addr) + ret void +} + +define void @test_masked_ldst_sv2f64(double* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv2f64: +; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %base_double = getelementptr double, double* %base, i64 %offset + %base_addr = bitcast double* %base_double to * + %data = call @llvm.aarch64.sve.ldnt1.nxv2f64( %mask,* %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv2f64( %data, %mask, * %base_addr) + ret void +} + +; 4-lane non-temporal load/stores. + +define void @test_masked_ldst_sv4i32(i32* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4i32: +; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_i32 = getelementptr i32, i32* %base, i64 %offset + %base_addr = bitcast i32* %base_i32 to * + %data = call @llvm.aarch64.sve.ldnt1.nxv4i32( %mask, * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv4i32( %data, %mask, * %base_addr) + ret void +} + +define void @test_masked_ldst_sv4f32(float* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv4f32: +; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %base_float = getelementptr float, float* %base, i64 %offset + %base_addr = bitcast float* %base_float to * + %data = call @llvm.aarch64.sve.ldnt1.nxv4f32( %mask, * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv4f32( %data, %mask, * %base_addr) + ret void +} + + +; 8-lane non-temporal load/stores. 
+ +define void @test_masked_ldst_sv8i16(i16* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8i16: +; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_i16 = getelementptr i16, i16* %base, i64 %offset + %base_addr = bitcast i16* %base_i16 to * + %data = call @llvm.aarch64.sve.ldnt1.nxv8i16( %mask, * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv8i16( %data, %mask, * %base_addr) + ret void +} + +define void @test_masked_ldst_sv8f16(half* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv8f16: +; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %base_half = getelementptr half, half* %base, i64 %offset + %base_addr = bitcast half* %base_half to * + %data = call @llvm.aarch64.sve.ldnt1.nxv8f16( %mask, * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv8f16( %data, %mask, * %base_addr) + ret void +} + +; 16-lane non-temporal load/stores. + +define void @test_masked_ldst_sv16i8(i8* %base, %mask, i64 %offset) nounwind { +; CHECK-LABEL: test_masked_ldst_sv16i8: +; CHECK-NEXT: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1] +; CHECK-NEXT: stnt1b { z[[DATA]].b }, p0, [x0, x1] +; CHECK-NEXT: ret + %base_i8 = getelementptr i8, i8* %base, i64 %offset + %base_addr = bitcast i8* %base_i8 to * + %data = call @llvm.aarch64.sve.ldnt1.nxv16i8( %mask, * %base_addr) + call void @llvm.aarch64.sve.stnt1.nxv16i8( %data, %mask, * %base_addr) + ret void +} + +; 2-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv2i64(, *) +declare @llvm.aarch64.sve.ldnt1.nxv2f64(, *) + +; 4-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv4i32(, *) +declare @llvm.aarch64.sve.ldnt1.nxv4f32(, *) + +; 8-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv8i16(, *) +declare @llvm.aarch64.sve.ldnt1.nxv8f16(, *) + +; 16-element non-temporal loads. +declare @llvm.aarch64.sve.ldnt1.nxv16i8(, *) + +; 2-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv2i64(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv2f64(, , *) + +; 4-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv4i32(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv4f32(, , *) + +; 8-element non-temporal stores. +declare void @llvm.aarch64.sve.stnt1.nxv8i16(, , *) +declare void @llvm.aarch64.sve.stnt1.nxv8f16(, , *) + +; 16-element non-temporal stores. 
+declare void @llvm.aarch64.sve.stnt1.nxv16i8(, , *) diff --git a/llvm/test/CodeGen/AArch64/sve-vscale-combine.ll b/llvm/test/CodeGen/AArch64/sve-vscale-combine.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-vscale-combine.ll @@ -0,0 +1,98 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s |FileCheck %s + +declare i32 @llvm.vscale.i32() +declare i64 @llvm.vscale.i64() + +; Fold (add (vscale * C0), (vscale C1)) to (vscale * (C0 + C1))) +define i64 @combine_add_vscale_i64() nounwind { +; CHECK-LABEL: combine_add_vscale_i64: +; CHECK-NOT: add +; CHECK-NEXT: cntd x0 +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %add = add i64 %vscale, %vscale + ret i64 %add +} + +define i32 @combine_add_vscale_i32() nounwind { +; CHECK-LABEL: combine_add_vscale_i32: +; CHECK-NOT: add +; CHECK-NEXT: cntd x0 +; CHECK-NEXT: ret + %vscale = call i32 @llvm.vscale.i32() + %add = add i32 %vscale, %vscale + ret i32 %add +} + +; Fold (mul (vscale * C0), C1) to (vscale * C0 * C1))) +; In this test, C0 = 1, C1 = 16. +define i64 @combine_mul_vscale_i64() nounwind { +; CHECK-LABEL: combine_mul_vscale_i64: +; CHECK-NOT: mul +; CHECK-NEXT: rdvl x0, #2 +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %mul = mul i64 %vscale, 32 + ret i64 %mul +} + +define i32 @combine_mul_vscale_i32() nounwind { +; CHECK-LABEL: combine_mul_vscale_i32: +; CHECK-NOT: mul +; CHECK-NEXT: rdvl x0, #3 +; CHECK-NEXT: ret + %vscale = call i32 @llvm.vscale.i32() + %mul = mul i32 %vscale, 48 + ret i32 %mul +} + +; Canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C)) +define i64 @combine_sub_vscale_i64(i64 %in) nounwind { +; CHECK-LABEL: combine_sub_vscale_i64: +; CHECK-NOT: sub +; CHECK-NEXT: rdvl x8, #-1 +; CHECK-NEXT: asr x8, x8, #4 +; CHECK-NEXT: add x0, x0, x8 +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %sub = sub i64 %in, %vscale + ret i64 %sub +} + +define i32 @combine_sub_vscale_i32(i32 %in) nounwind { +; CHECK-LABEL: combine_sub_vscale_i32: +; CHECK-NOT: sub +; CHECK-NEXT: rdvl x8, #-1 +; CHECK-NEXT: asr x8, x8, #4 +; CHECK-NEXT: add w0, w0, w8 +; CHECK-NEXT: ret + %vscale = call i32 @llvm.vscale.i32() + %sub = sub i32 %in, %vscale + ret i32 %sub +} + + +; Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1))) +; C0 = 1 , C1 = 4 +; At IR level, %shl = 2^4 * VSCALE. +; At Assembly level, the output of RDVL is also 2^4 * VSCALE. +; Hence, the immediate for RDVL is #1. +define i64 @combine_shl_vscale_i64() nounwind { +; CHECK-LABEL: combine_shl_vscale_i64: +; CHECK-NOT: shl +; CHECK-NEXT: rdvl x0, #1 +; CHECK-NEXT: ret + %vscale = call i64 @llvm.vscale.i64() + %shl = shl i64 %vscale, 4 + ret i64 %shl +} + +define i32 @combine_shl_vscale_i32() nounwind { +; CHECK-LABEL: combine_shl_vscale_i32: +; CHECK-NOT: shl +; CHECK-NEXT: rdvl x0, #1 +; CHECK-NEXT: ret + %vscale = call i32 @llvm.vscale.i32() + %shl = shl i32 %vscale, 4 + ret i32 %shl +}
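+
+; For reference, RDVL Xd, #imm returns imm x 16 x vscale (the immediate times
+; the vector length in bytes). That is why, after the folds above,
+; vscale x 32 selects rdvl #2, vscale x 48 selects rdvl #3, and
+; (vscale << 4) = vscale x 16 selects rdvl #1.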