Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15931,7 +15931,8 @@
 /// post-increment LD1R.
 static SDValue performPostLD1Combine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
-                                     bool IsLaneOp) {
+                                     bool IsLaneOp,
+                                     const AArch64Subtarget *Subtarget) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -15941,6 +15942,10 @@
   if (VT.isScalableVector())
     return SDValue();
 
+  if (VT.isFixedLengthVector() && VT.getFixedSizeInBits() > 128 &&
+      Subtarget->useSVEForFixedLengthVectors())
+    return SDValue();
+
   unsigned LoadIdx = IsLaneOp ? 1 : 0;
   SDNode *LD = N->getOperand(LoadIdx).getNode();
   // If it is not LOAD, can not do such combine.
@@ -17449,11 +17454,12 @@
 }
 
 static SDValue
-performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              const AArch64Subtarget *Subtarget) {
   if (SDValue Res = removeRedundantInsertVectorElt(N))
     return Res;
 
-  return performPostLD1Combine(N, DCI, true);
+  return performPostLD1Combine(N, DCI, true, Subtarget);
 }
 
 SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
@@ -17576,7 +17582,7 @@
   case AArch64ISD::CSEL:
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
-    return performPostLD1Combine(N, DCI, false);
+    return performPostLD1Combine(N, DCI, false, Subtarget);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:
@@ -17609,7 +17615,7 @@
   case AArch64ISD::SUNPKLO:
     return performSunpkloCombine(N, DAG);
   case ISD::INSERT_VECTOR_ELT:
-    return performInsertVectorEltCombine(N, DCI);
+    return performInsertVectorEltCombine(N, DCI, Subtarget);
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DAG);
   case ISD::VECREDUCE_ADD:
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-ld1-post-inc.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-ld1-post-inc.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mcpu=a64fx -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s
+
+; This test is here to ensure we don't get a selection error caused
+; by performPostLD1Combine, which should bail out if it sees fixed length
+; vectors.
+
+define dso_local void @test_fixed_ld1_post_inc(i8** %a, double* %b, i8* %c) {
+; CHECK-LABEL: test_fixed_ld1_post_inc:
+; CHECK:       // %bb.0: // %L.entry
+; CHECK-NEXT:    index z3.d, #0, #1
+; CHECK-NEXT:    ldr d0, [x8]
+; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z3.d, z2.d
+; CHECK-NEXT:    mov z0.d, p0/m, d1
+; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    fmad z0.d, p0/m, z0.d, z0.d
+; CHECK-NEXT:    mov z0.d, z0.d[5]
+; CHECK-NEXT:    stur d0, [x2, #-8]
+; CHECK-NEXT:    ldr d0, [x1, #8]
+; CHECK-NEXT:    str d0, [x1]
+; CHECK-NEXT:    ret
+L.entry:
+  %0 = getelementptr i8, i8* %c, i64 -8
+  %1 = bitcast i8* %0 to double*
+  %2 = load <1 x double>, <1 x double>* undef
+  %r = load double, double* %b
+  %3 = shufflevector <1 x double> %2, <1 x double> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %4 = insertelement <8 x double> %3, double %r, i32 1
+  %5 = shufflevector <8 x double> %4, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %6 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> %5, <8 x double> undef) #2
+  %7 = extractelement <8 x double> %6, i32 5
+  store double %7, double* %1
+  %8 = getelementptr double, double* %b, i64 1
+  %9 = load double, double* %8
+  store double %9, double* %b
+  ret void
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare dso_local <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)