Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15255,6 +15255,33 @@
   return DAG.getZExtOrTrunc(Res, DL, VT);
 }
 
+// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
+// ... into: "ptrue p, all" + PTEST
+static SDValue
+performFirstTrueTestVectorCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const AArch64Subtarget *Subtarget) {
+  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if (!Subtarget->hasSVE() || !DCI.isBeforeLegalize())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N0.getValueType();
+
+  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!Idx || Idx->getZExtValue() != 0)
+    return SDValue();
+
+  // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
+  SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
+  return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
+}
+
 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
                                       SelectionDAG &DAG) {
   SDLoc DL(N);
@@ -18134,8 +18161,12 @@
         N, DAG, Subtarget, useSVEForFixedLengthVectorVT(N->getValueType(0)));
   case ISD::INSERT_VECTOR_ELT:
     return performInsertVectorEltCombine(N, DCI);
-  case ISD::EXTRACT_VECTOR_ELT:
-    return performExtractVectorEltCombine(N, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: {
+    SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget);
+    if (Res == SDValue())
+      Res = performExtractVectorEltCombine(N, DAG);
+    return Res;
+  }
   case ISD::VECREDUCE_ADD:
     return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
   case AArch64ISD::UADDV:
Index: llvm/test/CodeGen/AArch64/sve-extract-element.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-extract-element.ll
+++ llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -481,9 +481,9 @@
 define i1 @test_lane0_16xi1(<vscale x 16 x i1> %a) #0 {
 ; CHECK-LABEL: test_lane0_16xi1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, mi
 ; CHECK-NEXT:    ret
   %b = extractelement <vscale x 16 x i1> %a, i32 0
   ret i1 %b
Index: llvm/test/CodeGen/AArch64/sve-ptrue.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-ptrue.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @add_arrays(i32* nocapture %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %p0, <vscale x 4 x i64> %step) #0 {
+; CHECK-LABEL: add_arrays:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    cntw x9
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ptrue p2.s
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x0, x8, lsl #2]
+; CHECK-NEXT:    add z3.s, z3.s, z0.s
+; CHECK-NEXT:    st1w { z3.s }, p0, [x0, x8, lsl #2]
+; CHECK-NEXT:    add x8, x8, x9
+; CHECK-NEXT:    mov z3.d, x8
+; CHECK-NEXT:    add z4.d, z3.d, z1.d
+; CHECK-NEXT:    add z3.d, z3.d, z2.d
+; CHECK-NEXT:    cmphi p0.d, p1/z, z2.d, z3.d
+; CHECK-NEXT:    cmphi p3.d, p1/z, z1.d, z4.d
+; CHECK-NEXT:    uzp1 p0.s, p3.s, p0.s
+; CHECK-NEXT:    ptest p2, p0.b
+; CHECK-NEXT:    b.mi .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %predicate = phi <vscale x 4 x i1> [ %p0, %entry ], [ %predicate.next, %vector.body ]
+  %addr = getelementptr inbounds i32, i32* %a, i64 %index
+  %addload = bitcast i32* %addr to <vscale x 4 x i32>*
+  %wide.masked.load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %addload, i32 4, <vscale x 4 x i1> %predicate, <vscale x 4 x i32> undef)
+  %c = add <vscale x 4 x i32> %wide.masked.load, %b
+  %addstore = bitcast i32* %addr to <vscale x 4 x i32>*
+  call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %c, <vscale x 4 x i32>* %addstore, i32 4, <vscale x 4 x i1> %predicate)
+  %cond = icmp ult i64 %index, 4294967294
+  call void @llvm.assume(i1 %cond)
+  %0 = call i64 @llvm.vscale.i64()
+  %shl2 = shl nuw nsw i64 %0, 2
+  %index.next = add nuw nsw i64 %index, %shl2
+  %splatinsert = insertelement <vscale x 4 x i64> poison, i64 %index.next, i64 0
+  %splat = shufflevector <vscale x 4 x i64> %splatinsert, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+  %1 = add nuw <vscale x 4 x i64> %splat, %step
+  %predicate.next = icmp ult <vscale x 4 x i64> %1, %step
+  %test0 = extractelement <vscale x 4 x i1> %predicate.next, i64 0
+  br i1 %test0, label %vector.body, label %cleanup
+
+cleanup:                                          ; preds = %vector.body, %entry
+  ret void
+}
+
+declare void @llvm.assume(i1 noundef)
+declare i64 @llvm.vscale.i64()
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>*, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32 immarg, <vscale x 4 x i1>)
+
+attributes #0 = { nofree norecurse nosync nounwind uwtable vscale_range(1,16) "target-cpu"="generic" "target-features"="+neon,+sve,+v8a" "unsafe-fp-math"="true" }
+