diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1196,7 +1196,7 @@
           return VLOperands::ScoreFail;
         // The distance is too large - still may be profitable to use masked
         // loads/gathers.
-        if (std::abs(*Dist) > NumLanes / 2)
+        if (std::abs(*Dist) > 1)
           return VLOperands::ScoreAltOpcodes;
         // This still will detect consecutive loads, but we might have "holes"
         // in some cases. It is ok for non-power-2 vectorization and may produce
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -0,0 +1,115 @@
+; RUN: opt < %s -slp-vectorizer -S -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+@a = common dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 16
+
+define void @s116_orig() {
+; CHECK-LABEL: @s116_orig(
+; CHECK: [[BASE:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %0
+; CHECK: [[BASE2:%.*]] = bitcast float* [[BASE]] to <4 x float>*
+; CHECK: [[VEC1:%.*]] = load <4 x float>, <4 x float>* [[BASE2]]
+; CHECK: [[VEC2A:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i32 0
+; CHECK: [[EL1:%.*]] = extractelement <4 x float> [[VEC1]], i32 0
+; CHECK: [[VEC2B:%.*]] = insertelement <4 x float> [[VEC2A]], float [[EL1]], i32 1
+; CHECK: [[EL2:%.*]] = extractelement <4 x float> [[VEC1]], i32 1
+; CHECK: [[VEC2C:%.*]] = insertelement <4 x float> [[VEC2B]], float [[EL2]], i32 2
+; CHECK: [[EL3:%.*]] = extractelement <4 x float> [[VEC1]], i32 2
+; CHECK: [[VEC2D:%.*]] = insertelement <4 x float> [[VEC2C]], float [[EL3]], i32 3
+; CHECK: [[FMUL1:%.*]] = fmul fast <4 x float> [[VEC1]], [[VEC2D]]
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %0
+  %1 = load float, float* %arrayidx, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv
+  %2 = load float, float* %arrayidx2, align 4, !tbaa !2
+  %mul = fmul fast float %2, %1
+  store float %mul, float* %arrayidx2, align 4, !tbaa !2
+  %3 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx7 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %3
+  %4 = load float, float* %arrayidx7, align 4, !tbaa !2
+  %mul11 = fmul fast float %4, %1
+  store float %mul11, float* %arrayidx, align 4, !tbaa !2
+  %5 = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx17 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %5
+  %6 = load float, float* %arrayidx17, align 4, !tbaa !2
+  %mul21 = fmul fast float %6, %4
+  store float %mul21, float* %arrayidx7, align 4, !tbaa !2
+  %7 = add nuw nsw i64 %indvars.iv, 4
+  %arrayidx27 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %7
+  %8 = load float, float* %arrayidx27, align 4, !tbaa !2
+  %mul31 = fmul fast float %8, %6
+  store float %mul31, float* %arrayidx17, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
+  %arrayidx37 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv.next
+  %9 = load float, float* %arrayidx37, align 4, !tbaa !2
+  %mul41 = fmul fast float %9, %8
+  store float %mul41, float* %arrayidx27, align 4, !tbaa !2
+  %cmp = icmp ult i64 %indvars.iv.next, 31995
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+define void @s116_mod() {
+; CHECK-LABEL: @s116_mod(
+; CHECK: [[BASE:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %0
+; CHECK: [[BASE2:%.*]] = bitcast float* [[BASE]] to <4 x float>*
+; CHECK: [[VEC1:%.*]] = load <4 x float>, <4 x float>* [[BASE2]]
+; CHECK: [[VEC2A:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i32 0
+; CHECK: [[EL1:%.*]] = extractelement <4 x float> [[VEC1]], i32 0
+; CHECK: [[VEC2B:%.*]] = insertelement <4 x float> [[VEC2A]], float [[EL1]], i32 1
+; CHECK: [[EL2:%.*]] = extractelement <4 x float> [[VEC1]], i32 1
+; CHECK: [[VEC2C:%.*]] = insertelement <4 x float> [[VEC2B]], float [[EL2]], i32 2
+; CHECK: [[EL3:%.*]] = extractelement <4 x float> [[VEC1]], i32 2
+; CHECK: [[VEC2D:%.*]] = insertelement <4 x float> [[VEC2C]], float [[EL3]], i32 3
+; CHECK: [[FMUL1:%.*]] = fmul fast <4 x float> [[VEC1]], [[VEC2D]]
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = or i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %0
+  %1 = load float, float* %arrayidx, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv
+  %2 = load float, float* %arrayidx2, align 16, !tbaa !2
+  %mul = fmul fast float %2, %1
+  store float %mul, float* %arrayidx2, align 16, !tbaa !2
+  %3 = or i64 %indvars.iv, 2
+  %arrayidx7 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %3
+  %4 = load float, float* %arrayidx7, align 8, !tbaa !2
+  %mul11 = fmul fast float %4, %1
+  store float %mul11, float* %arrayidx, align 4, !tbaa !2
+  %5 = or i64 %indvars.iv, 3
+  %arrayidx17 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %5
+  %6 = load float, float* %arrayidx17, align 4, !tbaa !2
+  %mul21 = fmul fast float %6, %4
+  store float %mul21, float* %arrayidx7, align 8, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %arrayidx27 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv.next
+  %7 = load float, float* %arrayidx27, align 16, !tbaa !2
+  %mul31 = fmul fast float %7, %6
+  store float %mul31, float* %arrayidx17, align 4, !tbaa !2
+  %cmp = icmp ult i64 %indvars.iv.next, 31996
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="thunderx2t99" "target-features"="+crc,+crypto,+fp-armv8,+lse,+neon,+rdm,+v8.1a" "unsafe-fp-math"="true" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 7.0.2"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"float", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}