Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -504,6 +504,11 @@
   /// Return true if the target supports masked load.
   bool isLegalMaskedLoad(Type *DataType) const;
 
+  /// Return true if the target supports nontemporal store.
+  bool isLegalNTStore(Type *DataType, unsigned Alignment) const;
+  /// Return true if the target supports nontemporal load.
+  bool isLegalNTLoad(Type *DataType, unsigned Alignment) const;
+
   /// Return true if the target supports masked scatter.
   bool isLegalMaskedScatter(Type *DataType) const;
   /// Return true if the target supports masked gather.
@@ -1087,6 +1092,8 @@
   virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0;
   virtual bool isLegalMaskedStore(Type *DataType) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
+  virtual bool isLegalNTStore(Type *DataType, unsigned Alignment) = 0;
+  virtual bool isLegalNTLoad(Type *DataType, unsigned Alignment) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType) = 0;
   virtual bool isLegalMaskedGather(Type *DataType) = 0;
   virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
@@ -1336,6 +1343,12 @@
   bool isLegalMaskedLoad(Type *DataType) override {
     return Impl.isLegalMaskedLoad(DataType);
   }
+  bool isLegalNTStore(Type *DataType, unsigned Alignment) override {
+    return Impl.isLegalNTStore(DataType, Alignment);
+  }
+  bool isLegalNTLoad(Type *DataType, unsigned Alignment) override {
+    return Impl.isLegalNTLoad(DataType, Alignment);
+  }
   bool isLegalMaskedScatter(Type *DataType) override {
     return Impl.isLegalMaskedScatter(DataType);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -222,6 +222,10 @@
   bool isLegalMaskedLoad(Type *DataType) { return false; }
 
+  bool isLegalNTStore(Type *DataType, unsigned Alignment) { return true; }
+
+  bool isLegalNTLoad(Type *DataType, unsigned Alignment) { return true; }
+
   bool isLegalMaskedScatter(Type *DataType) { return false; }
 
   bool isLegalMaskedGather(Type *DataType) { return false; }
Index: include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -205,12 +205,13 @@
 public:
   LoopVectorizationLegality(
       Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
-      TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
-      std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
-      OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
-      LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
-      : TheLoop(L), LI(LI), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
-        ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
+      TargetTransformInfo *TTI, TargetLibraryInfo *TLI, AliasAnalysis *AA,
+      Function *F, std::function<const LoopAccessInfo &(Loop &)> *GetLAA,
+      LoopInfo *LI, OptimizationRemarkEmitter *ORE,
+      LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
+      AssumptionCache *AC)
+      : TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT),
+        GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -406,6 +407,9 @@
   /// unrolling.
   PredicatedScalarEvolution &PSE;
 
+  /// Target Transform Info.
+  TargetTransformInfo *TTI;
+
   /// Target Library Info.
   TargetLibraryInfo *TLI;
 
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -177,6 +177,16 @@
   return TTIImpl->isLegalMaskedLoad(DataType);
 }
 
+bool TargetTransformInfo::isLegalNTStore(Type *DataType,
+                                         unsigned Alignment) const {
+  return TTIImpl->isLegalNTStore(DataType, Alignment);
+}
+
+bool TargetTransformInfo::isLegalNTLoad(Type *DataType,
+                                        unsigned Alignment) const {
+  return TTIImpl->isLegalNTLoad(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalMaskedGather(Type *DataType) const {
   return TTIImpl->isLegalMaskedGather(DataType);
 }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -186,6 +186,8 @@
   bool canMacroFuseCmp();
   bool isLegalMaskedLoad(Type *DataType);
   bool isLegalMaskedStore(Type *DataType);
+  bool isLegalNTLoad(Type *DataType, unsigned Alignment);
+  bool isLegalNTStore(Type *DataType, unsigned Alignment);
   bool isLegalMaskedGather(Type *DataType);
   bool isLegalMaskedScatter(Type *DataType);
   bool isLegalMaskedExpandLoad(Type *DataType);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3140,6 +3140,41 @@
   return isLegalMaskedLoad(DataType);
 }
 
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+  unsigned DataSize = DL.getTypeStoreSize(DataType);
+  // The only supported nontemporal loads are for aligned vectors of 16 or 32
+  // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
+  // (the equivalent stores only require AVX).
+  if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32))
+    return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2();
+
+  return false;
+}
+
+bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+  unsigned DataSize = DL.getTypeStoreSize(DataType);
+
+  // SSE4A supports nontemporal stores of float and double at arbitrary
+  // alignment.
+  if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy()))
+    return true;
+
+  // Besides the SSE4A subtarget exception above, only aligned nontemporal
+  // stores are available on any other subtarget, and only for stores whose
+  // size is a power of 2 between 4 and 32 bytes.
+  if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
+      !isPowerOf2_32(DataSize))
+    return false;
+
+  // 32-byte vector nontemporal stores are supported by AVX (the equivalent
+  // loads require AVX2).
+  if (DataSize == 32)
+    return ST->hasAVX();
+  else if (DataSize == 16)
+    return ST->hasSSE1();
+  return true;
+}
+
 bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
   if (!isa<VectorType>(DataTy))
     return false;
Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -742,13 +742,40 @@
       return false;
     }
 
-    // Check that the stored type is vectorizable.
     if (auto *ST = dyn_cast<StoreInst>(&I)) {
       Type *T = ST->getValueOperand()->getType();
+      // Check that the stored type is vectorizable.
       if (!VectorType::isValidElementType(T)) {
         ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
                   << "store instruction cannot be vectorized");
         return false;
+
+      // For nontemporal stores, check that a nontemporal vector version is
+      // supported on the target.
+      } else if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+        // Arbitrarily try a vector of 2 elements.
+        Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+        assert(VecTy && "did not find vectorized version of stored type");
+        unsigned Alignment = getLoadStoreAlignment(ST);
+        if (!TTI->isLegalNTStore(VecTy, Alignment)) {
+          ORE->emit(createMissedAnalysis("CantVectorizeNontemporalStore", ST)
+                    << "nontemporal store instruction cannot be vectorized");
+          return false;
+        }
+      }
+
+    } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+      if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+        // For nontemporal loads, check that a nontemporal vector version is
+        // supported on the target (arbitrarily try a vector of 2 elements).
+        Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+        assert(VecTy && "did not find vectorized version of load type");
+        unsigned Alignment = getLoadStoreAlignment(LD);
+        if (!TTI->isLegalNTLoad(VecTy, Alignment)) {
+          ORE->emit(createMissedAnalysis("CantVectorizeNontemporalLoad", LD)
+                    << "nontemporal load instruction cannot be vectorized");
+          return false;
+        }
       }
 
       // FP instructions can allow unsafe algebra, thus vectorizable by
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7270,7 +7270,7 @@
 
   // Check if it is legal to vectorize the loop.
   LoopVectorizationRequirements Requirements(*ORE);
-  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
+  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                 &Requirements, &Hints, DB, AC);
   if (!LVL.canVectorize(EnableVPlanNativePath)) {
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
Index: test/Transforms/LoopVectorize/X86/nontemporal.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/nontemporal.ll
+++ test/Transforms/LoopVectorize/X86/nontemporal.ll
@@ -0,0 +1,112 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+; The three test cases below are all based on modified versions of a simple
+; copy loop:
+;
+; void foo(unsigned *src, unsigned *dst, unsigned nElts) {
+;   for (unsigned i = 0; i < nElts; ++i) {
+;     unsigned tmp = src[i];
+;     dst[i] = tmp;
+;   }
+; }
+;
+; In the first version, there are no nontemporal stores or loads, so
+; vectorization is safely done.
+;
+; In the second version, the store into dst[i] has the nontemporal hint. The
+; alignment of 'unsigned' on X86_64 is 4, so the vector store generally will
+; not be aligned to the vector size (16 bytes here). Unaligned nontemporal
+; vector stores are not supported on X86_64, so vectorization is suppressed
+; (if the loop were vectorized, the nontemporal hint would not be honored in
+; the final code generation).
+;
+; The third version is analogous to the second, except that it is the load
+; from 'src[i]', rather than the store, that has the nontemporal hint.
+; Vectorization is suppressed in this case because (like stores) unaligned
+; nontemporal vector loads are not supported on X86_64.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64"
+
+; CHECK-LABEL: @vectorTest(
+define void @vectorTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+  %cmp8 = icmp eq i32 %nElts, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %nElts to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+; Check that we vectorized the load, and that there is no nontemporal hint.
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 4{{$}}
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+; Check that we vectorized the store, and that there is no nontemporal hint.
+; CHECK: store <4 x i32> %wide.load, <4 x i32>* %{{[0-9]+}}, align 4{{$}}
+  %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @vectorNTStoreTest(
+; Check that the vectorized type of the store does not appear.
+; CHECK-NOT: 4 x i32
+define void @vectorNTStoreTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+  %cmp8 = icmp eq i32 %nElts, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %nElts to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+; Check that the store is not vectorized and that we don't lose the !nontemporal hint in it.
+; CHECK: store i32 %{{[0-9]+}}, i32* %arrayidx2, align 4, !nontemporal !4
+  store i32 %0, i32* %arrayidx2, align 4, !nontemporal !0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @vectorNTLoadTest(
+; Check that the vectorized type of the load does not appear.
+; CHECK-NOT: 4 x i32
+define void @vectorNTLoadTest(i32* noalias readonly %src, i32* noalias %dst, i32 %nElts) {
+entry:
+  %cmp8 = icmp eq i32 %nElts, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %nElts to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+; Check that the load is not vectorized and that we don't lose the !nontemporal hint in it.
+; CHECK: load i32, i32* %arrayidx, align 4, !nontemporal !4
+  %0 = load i32, i32* %arrayidx, align 4, !nontemporal !0
+  %arrayidx2 = getelementptr inbounds i32, i32* %dst, i64 %indvars.iv
+  store i32 %0, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+!0 = !{i32 1}
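
Note for reviewers: the !nontemporal metadata exercised by the tests above is what Clang emits for its nontemporal builtins, so the second test case corresponds roughly to C source like the sketch below (the function and variable names are illustrative, not part of the patch):

  // A copy loop whose store carries the nontemporal hint. Clang lowers the
  // builtin to a 'store ... !nontemporal' like the one in @vectorNTStoreTest,
  // and with this patch the loop vectorizer only widens it when the target
  // reports a legal nontemporal vector store for the widened type.
  void copyNTStore(unsigned *src, unsigned *dst, unsigned nElts) {
    for (unsigned i = 0; i < nElts; ++i) {
      unsigned tmp = src[i];
      __builtin_nontemporal_store(tmp, &dst[i]);
    }
  }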
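
To make the legality arithmetic concrete: the probe in LoopVectorizationLegality asks about a 2-element vector, so for the i32 store in @vectorNTStoreTest it effectively calls isLegalNTStore(<2 x i32>, /*Alignment=*/4). On X86 that takes the generic path (no SSE4A float/double exception), where DataSize is 8 and Alignment is 4; since 4 < 8, the store is rejected and the loop stays scalar. A standalone restatement of that rule, for illustration only (a hypothetical free function, not the in-tree implementation):

  // Mirrors the X86 nontemporal-store rule from this patch over raw sizes;
  // hypothetical helper, shown only to walk through the numbers.
  static bool sketchIsLegalNTStore(unsigned DataSize, unsigned Alignment,
                                   bool HasSSE1, bool HasAVX) {
    // Only aligned stores of power-of-2 sizes between 4 and 32 bytes qualify.
    if (Alignment < DataSize || DataSize < 4 || DataSize > 32 ||
        (DataSize & (DataSize - 1)) != 0)
      return false;
    if (DataSize == 32)
      return HasAVX;  // 32-byte NT vector stores need AVX.
    if (DataSize == 16)
      return HasSSE1; // 16-byte NT vector stores need SSE1.
    return true;      // 4- and 8-byte scalar NT stores (e.g. MOVNTI).
  }

  // The test's probe: sketchIsLegalNTStore(8, 4, true, true) returns false
  // because Alignment (4) < DataSize (8), so vectorization is suppressed.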