Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15914,7 +15914,8 @@
 /// post-increment LD1R.
 static SDValue performPostLD1Combine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
-                                     bool IsLaneOp) {
+                                     bool IsLaneOp,
+                                     const AArch64Subtarget *Subtarget) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -15924,6 +15925,10 @@
   if (VT.isScalableVector())
     return SDValue();
 
+  if (VT.isFixedLengthVector() && VT.getFixedSizeInBits() > 128 &&
+      Subtarget->useSVEForFixedLengthVectors())
+    return SDValue();
+
   unsigned LoadIdx = IsLaneOp ? 1 : 0;
   SDNode *LD = N->getOperand(LoadIdx).getNode();
   // If it is not LOAD, can not do such combine.
@@ -17432,11 +17437,12 @@
 }
 
 static SDValue
-performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              const AArch64Subtarget *Subtarget) {
   if (SDValue Res = removeRedundantInsertVectorElt(N))
     return Res;
 
-  return performPostLD1Combine(N, DCI, true);
+  return performPostLD1Combine(N, DCI, true, Subtarget);
 }
 
 SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
@@ -17559,7 +17565,7 @@
   case AArch64ISD::CSEL:
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
-    return performPostLD1Combine(N, DCI, false);
+    return performPostLD1Combine(N, DCI, false, Subtarget);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:
@@ -17592,7 +17598,7 @@
   case AArch64ISD::SUNPKLO:
     return performSunpkloCombine(N, DAG);
   case ISD::INSERT_VECTOR_ELT:
-    return performInsertVectorEltCombine(N, DCI);
+    return performInsertVectorEltCombine(N, DCI, Subtarget);
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DAG);
   case ISD::VECREDUCE_ADD:
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-ld1-post-inc.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-ld1-post-inc.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mcpu=a64fx -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s
+
+; This test ensures we do not get a selection error caused by
+; performPostLD1Combine, which should bail out when it sees SVE fixed-length
+; vectors.
+
+define dso_local void @test_fixed_ld1_post_inc(i8** %a) {
+; CHECK-LABEL: test_fixed_ld1_post_inc:
+; CHECK:       // %bb.0: // %L.entry
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.d, vl8
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z0.d
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  .LBB0_1: // %L.LB2_4167
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr d0, [x8]
+; CHECK-NEXT:    ldr d1, [x8]
+; CHECK-NEXT:    mov z0.d, p0/m, d1
+; CHECK-NEXT:    fmad z0.d, p1/m, z0.d, z0.d
+; CHECK-NEXT:    mov z0.d, z0.d[5]
+; CHECK-NEXT:    str d0, [x8, #8]
+; CHECK-NEXT:    b .LBB0_1
+L.entry:
+  %0 = load i8*, i8** %a
+  %1 = getelementptr i8, i8* %0, i64 -8
+  br label %L.LB2_4167
+
+L.LB2_4167:                                       ; preds = %L.LB2_4700, %L.entry
+  %2 = getelementptr i8, i8* %1, i64 undef
+  %3 = add i64 undef, undef
+  %4 = getelementptr i8, i8* %1, i64 %3
+  br label %L.LB2_4700
+
+L.LB2_4700:                                       ; preds = %L.LB2_4700, %L.LB2_4167
+  %.G0001p_5399.0 = phi i8* [ %4, %L.LB2_4167 ], [ %28, %L.LB2_4700 ]
+  %.G0003p_5409.0 = phi i8* [ undef, %L.LB2_4167 ], [ %27, %L.LB2_4700 ]
+  %.G0005p_5413.0 = phi i8* [ %2, %L.LB2_4167 ], [ %26, %L.LB2_4700 ]
+  %5 = getelementptr i8, i8* %.G0005p_5413.0, i64 -16
+  %6 = bitcast i8* %5 to double*
+  %7 = getelementptr i8, i8* %.G0001p_5399.0, i64 -16
+  %8 = bitcast i8* %7 to double*
+  store double undef, double* %8
+  %9 = getelementptr i8, i8* %.G0005p_5413.0, i64 -8
+  %10 = bitcast i8* %9 to double*
+  %11 = load <1 x double>, <1 x double>* undef
+  %12 = shufflevector <1 x double> %11, <1 x double> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %13 = insertelement <8 x double> %12, double undef, i32 1
+  %14 = insertelement <8 x double> %13, double undef, i32 2
+  %15 = insertelement <8 x double> %14, double undef, i32 3
+  %16 = insertelement <8 x double> %15, double undef, i32 4
+  %17 = load double, double* %6
+  %18 = insertelement <8 x double> %16, double %17, i32 5
+  %19 = insertelement <8 x double> %18, double undef, i32 6
+  %20 = shufflevector <8 x double> %19, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %21 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> %20, <8 x double> undef)
+  %22 = extractelement <8 x double> %21, i32 5
+  store double %22, double* %10
+  %23 = bitcast i8* %.G0003p_5409.0 to double*
+  %24 = bitcast i8* %.G0001p_5399.0 to double*
+  %25 = load double, double* %24
+  store double undef, double* %23
+  %26 = getelementptr i8, i8* %.G0005p_5413.0, i64 undef
+  %27 = getelementptr i8, i8* %.G0003p_5409.0, i64 undef
+  %28 = getelementptr i8, i8* %.G0001p_5399.0, i64 undef
+  br i1 undef, label %L.LB2_4700, label %L.LB2_4167
+}
+
+; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
+declare dso_local <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
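
For context, the condition the patch adds can be read as a single standalone predicate, sketched below. This is illustrative only, not part of the patch: the helper name shouldSkipPostLD1Combine is invented for the sketch, while EVT, AArch64Subtarget, and the two checks are taken directly from the hunks above (assuming the usual AArch64ISelLowering.cpp includes).

// Illustrative sketch: when performPostLD1Combine must now refuse to form a
// post-increment LD1R/LD1LANE.
static bool shouldSkipPostLD1Combine(EVT VT, const AArch64Subtarget *Subtarget) {
  // Scalable vectors were already rejected before this patch.
  if (VT.isScalableVector())
    return true;
  // New in this patch: fixed-length vectors wider than 128 bits are lowered
  // with SVE when useSVEForFixedLengthVectors() is true, so the NEON-sized
  // post-increment patterns do not apply and selection would fail.
  if (VT.isFixedLengthVector() && VT.getFixedSizeInBits() > 128 &&
      Subtarget->useSVEForFixedLengthVectors())
    return true;
  return false;
}

In the test, -aarch64-sve-vector-bits-min=512 makes the <8 x double> values 512-bit fixed-length vectors lowered with SVE, so the new guard fires and the insert/load pair is left to generic lowering instead of the LD1LANE post-increment combine.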