diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -899,6 +899,7 @@
   bool isExtFreeImpl(const Instruction *Ext) const override;

   void addTypeForNEON(MVT VT);
+  void addTypeForStreamingSVE(MVT VT);
   void addTypeForFixedLengthSVE(MVT VT);
   void addDRTypeForNEON(MVT VT);
   void addQRTypeForNEON(MVT VT);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1389,6 +1389,15 @@
     for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);

+    if (Subtarget->forceStreamingCompatibleSVE()) {
+      for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+                     MVT::v4i32, MVT::v2i64})
+        addTypeForStreamingSVE(VT);
+
+      for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
+        addTypeForStreamingSVE(VT);
+    }
+
     // NOTE: Currently this has to happen after computeRegisterProperties rather
     // than the preferred option of combining it with the addRegisterClass call.
     if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1595,6 +1604,23 @@
   return false;
 }

+void AArch64TargetLowering::addTypeForStreamingSVE(MVT VT) {
+  if (VT.isInteger()) {
+    setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+  }
+  setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+  setOperationAction(ISD::AND, VT, Custom);
+  setOperationAction(ISD::ADD, VT, Custom);
+  setOperationAction(ISD::SUB, VT, Custom);
+  setOperationAction(ISD::MUL, VT, Custom);
+  setOperationAction(ISD::MULHS, VT, Custom);
+  setOperationAction(ISD::MULHU, VT, Custom);
+  setOperationAction(ISD::ABS, VT, Custom);
+  setOperationAction(ISD::XOR, VT, Custom);
+}
+
 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

@@ -5754,8 +5780,7 @@
   case ISD::MLOAD:
     return LowerMLOAD(Op, DAG);
   case ISD::LOAD:
-    if (useSVEForFixedLengthVectorVT(Op.getValueType(),
-                                     Subtarget->forceStreamingCompatibleSVE()))
+    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
       return LowerFixedLengthVectorLoadToSVE(Op, DAG);
     return LowerLOAD(Op, DAG);
   case ISD::ADD:
@@ -11349,7 +11374,12 @@
 // Try 32-bit splatted SIMD immediate.
 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                   const APInt &Bits,
-                                  const SDValue *LHS = nullptr) {
+                                  const SDValue *LHS = nullptr,
+                                  const AArch64Subtarget *const Subtarget = nullptr) {
+  EVT VT = Op.getValueType();
+  if (Subtarget && VT.isFixedLengthVector() && Subtarget->forceStreamingCompatibleSVE())
+    return SDValue();
+
   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
     EVT VT = Op.getValueType();
@@ -11397,7 +11427,12 @@
 // Try 16-bit splatted SIMD immediate.
 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                   const APInt &Bits,
-                                  const SDValue *LHS = nullptr) {
+                                  const SDValue *LHS = nullptr,
+                                  const AArch64Subtarget *const Subtarget = nullptr) {
+  EVT VT = Op.getValueType();
+  if (Subtarget && VT.isFixedLengthVector() && Subtarget->forceStreamingCompatibleSVE())
+    return SDValue();
+
   if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
     uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
     EVT VT = Op.getValueType();
@@ -12078,7 +12113,8 @@

 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                    SelectionDAG &DAG) const {
-  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+  if (useSVEForFixedLengthVectorVT(Op.getValueType(),
+                                   Subtarget->forceStreamingCompatibleSVE()))
     return LowerFixedLengthConcatVectorsToSVE(Op, DAG);

   assert(Op.getValueType().isScalableVector() &&
@@ -12184,7 +12220,8 @@
     return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
   }

-  if (useSVEForFixedLengthVectorVT(VT))
+  if (useSVEForFixedLengthVectorVT(VT,
+                                   Subtarget->forceStreamingCompatibleSVE()))
     return LowerFixedLengthExtractVectorElt(Op, DAG);

   // Check for non-constant or out of range lane.
@@ -12246,10 +12283,12 @@
   // If this is extracting the upper 64-bits of a 128-bit vector, we match
   // that directly.
   if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
-      InVT.getSizeInBits() == 128)
+      InVT.getSizeInBits() == 128 &&
+      !Subtarget->forceStreamingCompatibleSVE())
     return Op;

-  if (useSVEForFixedLengthVectorVT(InVT)) {
+  if (useSVEForFixedLengthVectorVT(InVT,
+                                   Subtarget->forceStreamingCompatibleSVE())) {
     SDLoc DL(Op);
     EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);

@@ -12437,7 +12476,7 @@

 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   // Currently no fixed length shuffles that require SVE are legal.
-  if (useSVEForFixedLengthVectorVT(VT))
+  if (useSVEForFixedLengthVectorVT(VT, Subtarget->forceStreamingCompatibleSVE()))
     return false;

   if (VT.getVectorNumElements() == 4 &&
@@ -12547,7 +12586,9 @@

   switch (Op.getOpcode()) {
   case ISD::SHL:
-    if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
+    if (VT.isScalableVector() ||
+        useSVEForFixedLengthVectorVT(VT,
+                                     Subtarget->forceStreamingCompatibleSVE()))
       return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);

     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
@@ -12559,7 +12600,9 @@
                        Op.getOperand(0), Op.getOperand(1));
   case ISD::SRA:
   case ISD::SRL:
-    if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
+    if (VT.isScalableVector() ||
+        useSVEForFixedLengthVectorVT(
+            VT, Subtarget->forceStreamingCompatibleSVE())) {
       unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
                                                 : AArch64ISD::SRL_PRED;
       return LowerToPredicatedOp(Op, DAG, Opc);
@@ -13958,6 +14001,11 @@
 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                   ShuffleVectorInst *SVI,
                                                   unsigned Factor) const {
+  // Skip if streaming-compatible SVE is enabled, because the code generated
+  // here is invalid in streaming mode when the SVE length is not specified.
+  if (Subtarget->forceStreamingCompatibleSVE())
+    return false;
+
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");

@@ -15690,7 +15738,8 @@
 }

 static SDValue performANDCombine(SDNode *N,
-                                 TargetLowering::DAGCombinerInfo &DCI) {
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *const Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
@@ -15725,16 +15774,16 @@

     DefBits = ~DefBits;
     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
-                                    DefBits, &LHS)) ||
+                                    DefBits, &LHS, Subtarget)) ||
         (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
-                                    DefBits, &LHS)))
+                                    DefBits, &LHS, Subtarget)))
       return NewOp;

     UndefBits = ~UndefBits;
     if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
-                                    UndefBits, &LHS)) ||
+                                    UndefBits, &LHS, Subtarget)) ||
         (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
-                                    UndefBits, &LHS)))
+                                    UndefBits, &LHS, Subtarget)))
       return NewOp;
   }

@@ -20528,7 +20577,7 @@
   case ISD::OR:
     return performORCombine(N, DCI, Subtarget);
   case ISD::AND:
-    return performANDCombine(N, DCI);
+    return performANDCombine(N, DCI, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
     return performIntrinsicCombine(N, DCI, Subtarget);
   case ISD::ANY_EXTEND:
@@ -22335,7 +22384,7 @@
 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  assert(useSVEForFixedLengthVectorVT(VT) &&
+  assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
          "Only expected to lower fixed length vector operation!");
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

@@ -22351,7 +22400,7 @@
     }

     // "cast" fixed length vector to a scalable vector.
-    assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
+    assert(useSVEForFixedLengthVectorVT(V.getValueType(), Subtarget->forceStreamingCompatibleSVE()) &&
            "Only fixed length vectors are supported!");
     Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
   }
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3032,7 +3032,7 @@
             (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;

   // Extract element from vector with immediate index that's within the bottom 128-bits.
-  let AddedComplexity = 1 in {
+  let Predicates = [NotInStreamingSVEMode], AddedComplexity = 1 in {
   def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)),
             (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
   def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)),
@@ -3042,7 +3042,7 @@
   def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)),
             (i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>;
   }
-
+  let Predicates = [NotInStreamingSVEMode] in {
   def : Pat<(sext_inreg (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index), i8),
             (i32 (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
   def : Pat<(sext_inreg (anyext (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)), i8),
             (i32 (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
@@ -3055,7 +3055,7 @@
   def : Pat<(sext (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
             (i64 (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
-
+  }

   // Extract first element from vector.
   let AddedComplexity = 2 in {
   def : Pat<(vector_extract (nxv16i8 ZPR:$Zs), (i64 0)),
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
@@ -6,8 +6,8 @@
 define <4 x i8> @load_v4i8(<4 x i8>* %a) #0 {
 ; CHECK-LABEL: load_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %load = load <4 x i8>, <4 x i8>* %a
@@ -44,12 +44,14 @@
 define <2 x i16> @load_v2i16(<2 x i16>* %a) #0 {
 ; CHECK-LABEL: load_v2i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldrh w8, [x0, #2]
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str w8, [sp, #12]
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    str w8, [sp, #8]
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
   %load = load <2 x i16>, <2 x i16>* %a
   ret <2 x i16> %load
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -65,7 +65,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI5_0
 ; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI5_0]
-; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   store <2 x half> zeroinitializer, <2 x half>* %a
   ret void
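
As a rough illustration (not part of the patch), the lowering path enabled above can be exercised with a small IR test in the style of the sve-streaming-mode-fixed-length-*.ll files touched here; the RUN line and the -force-streaming-compatible-sve option are assumed to match those existing tests.

; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

; Illustrative only. With streaming-compatible SVE forced, this fixed-length
; multiply should be lowered via the SVE path registered by
; addTypeForStreamingSVE (MUL is marked Custom there) instead of NEON.
define <4 x i32> @mul_v4i32(<4 x i32> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: mul_v4i32:
  %res = mul <4 x i32> %a, %b
  ret <4 x i32> %res
}

attributes #0 = { "target-features"="+sve" }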