Index: llvm/lib/IR/IRBuilder.cpp
===================================================================
--- llvm/lib/IR/IRBuilder.cpp
+++ llvm/lib/IR/IRBuilder.cpp
@@ -528,14 +528,14 @@
 CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, Align Alignment,
                                             Value *Mask, Value *PassThru,
                                             const Twine &Name) {
-  auto *PtrsTy = cast<FixedVectorType>(Ptrs->getType());
+  auto *PtrsTy = cast<VectorType>(Ptrs->getType());
   auto *PtrTy = cast<PointerType>(PtrsTy->getElementType());
-  unsigned NumElts = PtrsTy->getNumElements();
-  auto *DataTy = FixedVectorType::get(PtrTy->getElementType(), NumElts);
+  ElementCount EC = PtrsTy->getElementCount();
+  auto *DataTy = VectorType::get(PtrTy->getElementType(), EC);
 
   if (!Mask)
     Mask = Constant::getAllOnesValue(
-        FixedVectorType::get(Type::getInt1Ty(Context), NumElts));
+        VectorType::get(Type::getInt1Ty(Context), EC));
 
   if (!PassThru)
     PassThru = UndefValue::get(DataTy);
@@ -558,20 +558,20 @@
 /// be accessed in memory
 CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
                                              Align Alignment, Value *Mask) {
-  auto *PtrsTy = cast<FixedVectorType>(Ptrs->getType());
-  auto *DataTy = cast<FixedVectorType>(Data->getType());
-  unsigned NumElts = PtrsTy->getNumElements();
+  auto *PtrsTy = cast<VectorType>(Ptrs->getType());
+  auto *DataTy = cast<VectorType>(Data->getType());
+  ElementCount EC = PtrsTy->getElementCount();
 
 #ifndef NDEBUG
   auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
-  assert(NumElts == DataTy->getNumElements() &&
+  assert(EC == DataTy->getElementCount() &&
          PtrTy->getElementType() == DataTy->getElementType() &&
          "Incompatible pointer and data types");
 #endif
 
   if (!Mask)
     Mask = Constant::getAllOnesValue(
-        FixedVectorType::get(Type::getInt1Ty(Context), NumElts));
+        VectorType::get(Type::getInt1Ty(Context), EC));
 
   Type *OverloadedTypes[] = {DataTy, PtrsTy};
   Value *Ops[] = {Data, Ptrs, getInt32(Alignment.value()), Mask};
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6428,8 +6428,18 @@
         // relying on instcombine to remove them.
         // Load: Scalar load + broadcast
         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
-        unsigned Cost = getUniformMemOpCost(&I, VF);
-        setWideningDecision(&I, VF, CM_Scalarize, Cost);
+
+        // For scalable vectors, use gather/scatter instructions instead of a
+        // scalarized store for now, until we add a mechanism to extract the
+        // last lane of the vector. This is allowed, since llvm.masked.scatter
+        // is guaranteed to be an ordered store (from first to last element).
+        if (VF.isScalable())
+          setWideningDecision(&I, VF, CM_GatherScatter,
+                              /*Cost=*/VF.getKnownMinValue());
+        else {
+          unsigned Cost = getUniformMemOpCost(&I, VF);
+          setWideningDecision(&I, VF, CM_Scalarize, Cost);
+        }
         continue;
       }
 
@@ -6446,6 +6456,14 @@
         continue;
       }
 
+      // For scalable vectors, all accesses must be widened using gather/scatter
+      // or interleaved loads/stores. For now, default to gather/scatter.
+      if (VF.isScalable()) {
+        setWideningDecision(&I, VF, CM_GatherScatter,
+                            /*Cost=*/VF.getKnownMinValue());
+        continue;
+      }
+
       // Choose between Interleaving, Gather/Scatter or Scalarization.
       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
       unsigned NumAccesses = 1;
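For illustration (not part of the patch; every name below is ours): with ElementCount threaded through, CreateMaskedGather can now be handed a scalable vector of pointers, and the result type, all-true mask, and undef pass-through all fall out of the ElementCount rather than a fixed element count. A minimal standalone sketch against the CreateMaskedGather signature shown above:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("gather-demo", Ctx);
    // A scalable vector of pointers: <vscale x 4 x double*>.
    auto *PtrTy = PointerType::getUnqual(Type::getDoubleTy(Ctx));
    auto *PtrsTy = ScalableVectorType::get(PtrTy, /*MinNumElts=*/4);
    auto *FnTy = FunctionType::get(Type::getVoidTy(Ctx), {PtrsTy}, false);
    auto *Fn = Function::Create(FnTy, Function::ExternalLinkage, "demo", M);
    IRBuilder<> B(BasicBlock::Create(Ctx, "entry", Fn));
    // Mask and PassThru are omitted: with the patch, the builder synthesizes
    // an all-true <vscale x 4 x i1> mask and an undef <vscale x 4 x double>
    // pass-through from the pointer vector's ElementCount.
    B.CreateMaskedGather(Fn->getArg(0), Align(8));
    B.CreateRetVoid();
    // Prints a call to @llvm.masked.gather.nxv4f64.nxv4p0f64.
    M.print(outs(), nullptr);
    return 0;
  }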
Index: llvm/test/Transforms/LoopVectorize/scalable-loop-invariant-store-unpredicated-body-scalar-tail.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-loop-invariant-store-unpredicated-body-scalar-tail.ll
@@ -0,0 +1,68 @@
+; For now this test requires aarch64-registered-target, until we can
+; also pass the loop hint as a 'force-vector-width' flag to opt.
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt -S -loop-vectorize -instcombine < %s | FileCheck %s
+
+source_filename = "loop.c"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; This test has an invariant store to a[42], which is vectorized using a
+; masked.scatter operation.
+
+; CHECK: for.body.preheader:
+; CHECK-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECK-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
+
+; CHECK: vector.ph:
+; CHECK-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
+; CHECK: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECK: vector.body:
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECK: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECK: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 42
+; CHECK: %[[SPLATINSERT:.*]] = insertelement <vscale x 4 x double*> undef, double* %[[IDXA]], i32 0
+; CHECK: %[[SPLAT:.*]] = shufflevector <vscale x 4 x double*> %[[SPLATINSERT]], <vscale x 4 x double*> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> %[[FADD]], <vscale x 4 x double*> %[[SPLAT]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> undef, i1 true, i32 0), <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer))
+; CHECK: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK: %index.next = add i64 %index, %[[VSCALEX4]]
+; CHECK: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECK: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 42
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+}
+
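+; For reference, a plausible C source for @loop above (our reconstruction
+; from the IR, not taken from the original patch); the scalable
+; vectorize-width hint corresponds to the !llvm.loop metadata at the end
+; of the file:
+;
+;   void loop(int N, double *a, double *b) {
+;     for (int i = 0; i < N; i++)
+;       a[42] = b[i] + 1.0;
+;   }
+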
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", !4}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{i32 4, i1 1}
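The soundness argument in the LoopVectorize comment ("llvm.masked.scatter is guaranteed to be an ordered store, from first to last element") is what makes a scatter to a uniform address equivalent to the scalar loop: every lane writes &a[42], and the last lane's value is the one left in memory. A scalar reference model of that semantics (ours, for intuition only; fixed at four lanes for simplicity):

  #include <array>
  #include <cstddef>
  #include <cstdio>

  // Reference model of llvm.masked.scatter: active lanes are stored in
  // order, from the first element to the last.
  template <std::size_t N>
  void scatterModel(const std::array<double, N> &Data,
                    const std::array<double *, N> &Ptrs,
                    const std::array<bool, N> &Mask) {
    for (std::size_t I = 0; I < N; ++I)
      if (Mask[I])
        *Ptrs[I] = Data[I];
  }

  int main() {
    double A42 = 0.0; // stands in for a[42]
    std::array<double, 4> Data{1.0, 2.0, 3.0, 4.0};
    std::array<double *, 4> Ptrs{&A42, &A42, &A42, &A42}; // uniform address
    std::array<bool, 4> Mask{true, true, true, true};
    scatterModel(Data, Ptrs, Mask);
    std::printf("%f\n", A42); // prints 4.000000: the last lane wins
  }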