Index: lib/Transforms/Scalar/LoopRotation.cpp
===================================================================
--- lib/Transforms/Scalar/LoopRotation.cpp
+++ lib/Transforms/Scalar/LoopRotation.cpp
@@ -57,6 +57,20 @@
   return getLoopPassPreservedAnalyses();
 }
 
+/// Returns true if \p L carries "llvm.loop.vectorize.enable" metadata whose
+/// operand is a non-zero constant integer. Returns false when the loop has no
+/// such metadata; a present operand that is not a ConstantInt trips the assert.
+static bool isForcedVectorize(Loop *L) {
+  Optional<const MDOperand *> Value =
+      findStringMetadataForLoop(L, "llvm.loop.vectorize.enable");
+  if (!Value)
+    return false;
+
+  const MDOperand *Op = *Value;
+  assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+  return mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+}
+
 namespace {
 
 class LoopRotateLegacyPass : public LoopPass {
@@ -101,9 +115,13 @@
       MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
       MSSAU = MemorySSAUpdater(MSSA);
     }
-    return LoopRotation(L, LI, TTI, AC, DT, SE,
-                        MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
-                        false, MaxHeaderSize, false);
+
+    // When vectorization is forced for L, pass DefaultRotationThreshold instead
+    // of MaxHeaderSize.
+    return LoopRotation(
+        L, LI, TTI, AC, DT, SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
+        SQ, false,
+        isForcedVectorize(L) ? DefaultRotationThreshold : MaxHeaderSize, false);
   }
 };
 }
Index: test/Transforms/LoopVectorize/AArch64/Oz-and-forced-vectorize.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/Oz-and-forced-vectorize.ll
@@ -0,0 +1,59 @@
+; RUN: opt -Oz -S < %s | FileCheck %s
+
+; If vectorization is forced then -Oz should be overridden to allow loop
+; header duplication in loop rotation.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+define void @foo(float* noalias nocapture %ptrA, float* noalias nocapture readonly %ptrB, i64 %size) {
+entry:
+  br label %for.cond, !dbg !10
+
+for.cond:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ], !dbg !11
+  %exitcond = icmp eq i64 %indvars.iv, %size, !dbg !12
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !13
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  ret void, !dbg !14
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds float, float* %ptrB, i64 %indvars.iv, !dbg !15
+  %0 = load float, float* %arrayidx, align 4, !dbg !15, !tbaa !16
+  %arrayidx2 = getelementptr inbounds float, float* %ptrA, i64 %indvars.iv, !dbg !20
+  %1 = load float, float* %arrayidx2, align 4, !dbg !21, !tbaa !16
+  %mul3 = fmul float %0, %1, !dbg !21
+; CHECK: fmul <4 x float>
+  store float %mul3, float* %arrayidx2, align 4, !dbg !21, !tbaa !16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !22
+  br label %for.cond, !dbg !13, !llvm.loop !23
+}
+
+!llvm.module.flags = !{!3, !4, !5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, nameTableKind: GNU)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 7, !"PIC Level", i32 2}
+!7 = distinct !DISubprogram(name: "foo", scope: !8, file: !8, line: 3, type: !9, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!8 = !DIFile(filename: "/tmp/s.c", directory: "")
+!9 = !DISubroutineType(types: !2)
+!10 = !DILocation(line: 13, column: 14, scope: !7)
+!11 = !DILocation(line: 0, scope: !7)
+!12 = !DILocation(line: 13, column: 36, scope: !7)
+!13 = !DILocation(line: 13, column: 9, scope: !7)
+!14 = !DILocation(line: 18, column: 1, scope: !7)
+!15 = !DILocation(line: 15, column: 33, scope: !7)
+!16 = !{!17, !17, i64 0}
+!17 = !{!"float", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C/C++ TBAA"}
+!20 = !DILocation(line: 15, column: 13, scope: !7)
+!21 = !DILocation(line: 15, column: 30, scope: !7)
+!22 = !DILocation(line: 13, column: 44, scope: !7)
+!23 = distinct !{!23, !13, !24, !25}
+!24 = !DILocation(line: 16, column: 9, scope: !7)
+!25 = !{!"llvm.loop.vectorize.enable", i1 true}
Index: test/Transforms/LoopVectorize/AArch64/Oz-and-unforced-vectorize.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/Oz-and-unforced-vectorize.ll
@@ -0,0 +1,58 @@
+; RUN: opt -Oz -S < %s | FileCheck %s
+
+; -Oz does not allow for loop header duplication so we can't vectorize this.
+; See Oz-and-forced-vectorize.ll.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+define void @foo(float* noalias nocapture %ptrA, float* noalias nocapture readonly %ptrB, i64 %size) {
+entry:
+  br label %for.cond, !dbg !10
+
+for.cond:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ], !dbg !11
+  %exitcond = icmp eq i64 %indvars.iv, %size, !dbg !12
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !13
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  ret void, !dbg !14
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds float, float* %ptrB, i64 %indvars.iv, !dbg !15
+  %0 = load float, float* %arrayidx, align 4, !dbg !15, !tbaa !16
+  %arrayidx2 = getelementptr inbounds float, float* %ptrA, i64 %indvars.iv, !dbg !20
+  %1 = load float, float* %arrayidx2, align 4, !dbg !21, !tbaa !16
+  %mul3 = fmul float %0, %1, !dbg !21
+; CHECK-NOT: fmul <4 x float>
+  store float %mul3, float* %arrayidx2, align 4, !dbg !21, !tbaa !16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !22
+  br label %for.cond, !dbg !13, !llvm.loop !23
+}
+
+!llvm.module.flags = !{!3, !4, !5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, nameTableKind: GNU)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 7, !"PIC Level", i32 2}
+!7 = distinct !DISubprogram(name: "foo", scope: !8, file: !8, line: 3, type: !9, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!8 = !DIFile(filename: "/tmp/s.c", directory: "")
+!9 = !DISubroutineType(types: !2)
+!10 = !DILocation(line: 13, column: 14, scope: !7)
+!11 = !DILocation(line: 0, scope: !7)
+!12 = !DILocation(line: 13, column: 36, scope: !7)
+!13 = !DILocation(line: 13, column: 9, scope: !7)
+!14 = !DILocation(line: 18, column: 1, scope: !7)
+!15 = !DILocation(line: 15, column: 33, scope: !7)
+!16 = !{!17, !17, i64 0}
+!17 = !{!"float", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C/C++ TBAA"}
+!20 = !DILocation(line: 15, column: 13, scope: !7)
+!21 = !DILocation(line: 15, column: 30, scope: !7)
+!22 = !DILocation(line: 13, column: 44, scope: !7)
+!23 = distinct !{!23, !13, !24}
+!24 = !DILocation(line: 16, column: 9, scope: !7)