Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
@@ -93,6 +93,8 @@
   bool isLegalMaskedScatter(Type *DataType);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
+
+  bool enableInterleavedAccessVectorization();
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
                       unsigned Alignment, unsigned AddressSpace);
Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1767,3 +1767,10 @@
   // correct.
   return (CallerBits & CalleeBits) == CalleeBits;
 }
+
+bool X86TTIImpl::enableInterleavedAccessVectorization() {
+  // TODO: We expect this to be beneficial regardless of arch,
+  // but there are currently some unexplained performance artifacts on Atom.
+  // As a temporary solution, disable on Atom.
+  return !(ST->isAtom() || ST->isSLM());
+}
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -67,7 +67,7 @@
   %t2 = load float, float* %arrayidx3, align 4
   %add = fadd fast float %t1, %s.02
   %add4 = fadd fast float %add, %t2
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 32
   %cmp1 = icmp slt i64 %indvars.iv.next, %t0
   br i1 %cmp1, label %for, label %loopexit
 
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -85,7 +85,7 @@
 ; The source code
 ;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
 ;
-;     for (int i=0; i<SIZE; i++) {
+;     for (int i=0; i<SIZE; i += 16) {
 ;       if (trigger[i] > 0) {
 ;         out[i] = in[i].b + (float) 0.5;
 ;       }
@@ -95,9 +95,9 @@
 %struct.In = type { float, float }
 
 ;AVX512-LABEL: @foo2
-;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
+;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
 ;AVX512: llvm.masked.gather.v16f32
-;AVX512: llvm.masked.store.v16f32
+;AVX512: llvm.masked.scatter.v16f32
 ;AVX512: ret void
 define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
 entry:
@@ -147,7 +147,7 @@
 
 for.inc:                                          ; preds = %if.end
   %9 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %9, 1
+  %inc = add nsw i32 %9, 16
   store i32 %inc, i32* %i, align 4
   br label %for.cond
@@ -162,7 +162,7 @@
 ;};
 ;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
 ;
-;     for (int i=0; i<SIZE; i++) {
+;     for (int i=0; i<SIZE; i += 16) {
 ;       if (trigger[i] > 0) {
 ;         out[i].b = in[i].b + (float) 0.5;
 ;       }
@@ -170,10 +170,10 @@
 ;}
 
 ;AVX512-LABEL: @foo3
-;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
+;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
 ;AVX512: llvm.masked.gather.v16f32
 ;AVX512: fadd <16 x float>
-;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
+;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> {{.*}}, i32 1
 ;AVX512: llvm.masked.scatter.v16f32
 ;AVX512: ret void
 
@@ -226,7 +226,7 @@
 
 for.inc:                                          ; preds = %if.end
   %9 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %9, 1
+  %inc = add nsw i32 %9, 16
   store i32 %inc, i32* %i, align 4
   br label %for.cond
 
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/interleaving.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/interleaving.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=ATOM
+
+; NORMAL-LABEL: foo
+; NORMAL: %[[WIDE:.*]] = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; NORMAL: %[[STRIDED1:.*]] = shufflevector <8 x i32> %[[WIDE]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; NORMAL: %[[STRIDED2:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; NORMAL: add nsw <4 x i32> %[[STRIDED2]], %[[STRIDED1]]
+
+; ATOM-LABEL: foo
+; ATOM: load i32
+; ATOM: load i32
+; ATOM: store i32
+define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %0
+  %1 = load i32, i32* %arrayidx, align 4
+  %2 = or i64 %0, 1
+  %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %2
+  %3 = load i32, i32* %arrayidx3, align 4
+  %add4 = add nsw i32 %3, %1
+  %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %add4, i32* %arrayidx6, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -341,7 +341,7 @@
 ;
 ;void foo4(double *A, double *B, int *trigger) {
 ;
-;  for (int i=0; i<10000; i++) {
+;  for (int i=0; i<10000; i += 16) {
 ;    if (trigger[i] < 100) {
 ;      A[i] = B[i*2] + trigger[i]; << non-consecutive access
 ;    }
@@ -410,7 +410,7 @@
 
 for.inc:                                          ; preds = %if.end
   %12 = load i32, i32* %i, align 4
-  %inc = add nsw i32 %12, 1
+  %inc = add nsw i32 %12, 16
   store i32 %inc, i32* %i, align 4
   br label %for.cond
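
For reference, a hypothetical C source for @foo in the new interleaving.ll test (the commit ships only the IR, so the function signature and restrict qualifiers here are assumptions):

    /* Stride-2 (interleaved) loads: b[2*i] and b[2*i+1] are adjacent in
       memory, so the vectorizer can replace the two strided scalar loads
       with one wide load followed by even/odd shuffles. */
    void foo(int *restrict a, const int *restrict b) {
      for (int i = 0; i < 1024; i++)
        a[i] = b[2 * i] + b[2 * i + 1];
    }

On targets where the new enableInterleavedAccessVectorization() hook returns true, the NORMAL prefix checks for the single <8 x i32> load and the two de-interleaving shufflevectors; with -mcpu=atom the hook returns false, so the ATOM prefix checks that the loads stay scalar.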