diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -172,6 +172,29 @@
     return isLegalMaskedLoadStore(DataType, Alignment);
   }
 
+  /// Returns true if a nontemporal store of \p DataType with the given
+  /// \p Alignment is considered legal for this target.
+  bool isLegalNTStore(Type *DataType, Align Alignment) {
+    // Vector stores can be directly lowered to STNP, if the vector can be
+    // halved so that each half fits into a register. Here we only check for
+    // vectors with total sizes matching the total number of bits STNP can
+    // store.
+    if (isa<VectorType>(DataType) && DataType->getVectorNumElements() > 1) {
+      unsigned StoreSize =
+          DataType->getScalarSizeInBits() * DataType->getVectorNumElements();
+
+      Type *Ty = DataType->getVectorElementType();
+      return (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
+              Ty->isIntegerTy(32) || Ty->isIntegerTy(64) ||
+              Ty->isIntegerTy(128)) &&
+             (StoreSize == 16 || StoreSize == 32 || StoreSize == 64 ||
+              StoreSize == 128 || StoreSize == 256);
+    }
+    // Non-vector types and single-element vectors defer to the generic
+    // nontemporal-store query (not the masked-store one).
+    return BaseT::isLegalNTStore(DataType, Alignment);
+  }
+
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll
@@ -0,0 +1,82 @@
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphones -force-vector-width=4 -force-vector-interleave=1 %s -S | FileCheck %s
+
+define void @test_i32_store(i32* nocapture %ddst, i32 %pattern) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @test_i32_store(
+; CHECK-LABEL: vector.body:
+; CHECK: store <16 x i32> {{.*}} !nontemporal !0
+; CHECK: br
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.013 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %ddst.addr.012 = phi i32* [ %ddst, %entry ], [ %incdec.ptr3, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ddst.addr.012, i64 1
+  store i32 10, i32* %ddst.addr.012, align 4, !nontemporal !8
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %ddst.addr.012, i64 2
+  store i32 20, i32* %incdec.ptr, align 4, !nontemporal !8
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %ddst.addr.012, i64 3
+  store i32 30, i32* %incdec.ptr1, align 4, !nontemporal !8
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %ddst.addr.012, i64 4
+  store i32 40, i32* %incdec.ptr2, align 4, !nontemporal !8
+  %add = add nuw nsw i32 %i.013, 4
+  %cmp = icmp ult i32 %i.013, 4092
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
+
+
+define void @test_i33_store(i33* nocapture %ddst) {
+; CHECK-LABEL: define void @test_i33_store(
+; CHECK-NOT: vector.body:
+; CHECK: ret
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.013 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %ddst.addr.012 = phi i33* [ %ddst, %entry ], [ %incdec.ptr3, %for.body ]
+  %incdec.ptr = getelementptr inbounds i33, i33* %ddst.addr.012, i64 1
+  store i33 10, i33* %ddst.addr.012, align 4, !nontemporal !8
+  %incdec.ptr1 = getelementptr inbounds i33, i33* %ddst.addr.012, i64 2
+  store i33 20, i33* %incdec.ptr, align 4, !nontemporal !8
+  %incdec.ptr2 = getelementptr inbounds i33, i33* %ddst.addr.012, i64 3
+  store i33 30, i33* %incdec.ptr1, align 4, !nontemporal !8
+  %incdec.ptr3 = getelementptr inbounds i33, i33* %ddst.addr.012, i64 4
+  store i33 40, i33* %incdec.ptr2, align 4, !nontemporal !8
+  %add = add nuw nsw i32 %i.013, 3
+  %cmp = icmp ult i32 %i.013, 4092
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
+
+
+define void @test_i64_store(i64* nocapture %ddst) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @test_i64_store(
+; CHECK-LABEL: vector.body:
+; CHECK: store <4 x i64> {{.*}} !nontemporal !0
+; CHECK: br
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.013 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %ddst.addr.012 = phi i64* [ %ddst, %entry ], [ %incdec.ptr, %for.body ]
+  %incdec.ptr = getelementptr inbounds i64, i64* %ddst.addr.012, i64 1
+  store i64 10, i64* %ddst.addr.012, align 4, !nontemporal !8
+  %add = add nuw nsw i32 %i.013, 4
+  %cmp = icmp ult i32 %i.013, 4092
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
+
+!8 = !{i32 1}