Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -171,6 +171,10 @@
                                  bool IsPairwiseForm);
 
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+
+  bool shouldMaximizeVectorBandwidth(bool OptSize) const {
+    return !OptSize;
+  }
 
   /// @}
 };
Index: test/Transforms/LoopVectorize/AArch64/aarch64-trunc-vec.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/aarch64-trunc-vec.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -loop-vectorize -mtriple=aarch64-none-linux-gnu -mattr=+neon -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Original source code:
+;
+; void store_i32_to_i8 (const int *src, int width, unsigned char *dst)
+; {
+;   for (int i = 0; i < width; i++) {
+;     *dst++ = *src++;
+;   }
+;}
+
+; Function Attrs: norecurse nounwind
+define void @store_i32_to_i8(i32* noalias nocapture readonly %src, i32 %width, i8* nocapture %dst) {
+; CHECK-LABEL: store_i32_to_i8
+; CHECK-LABEL: vector.body:
+; CHECK: [[WIDE_LOAD0:%.*]] = load <16 x i32>, <16 x i32>* {{%.*}}, align 4
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* {{%.*}}, i32 16
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
+; CHECK: [[WIDE_LOAD1:%.*]] = load <16 x i32>, <16 x i32>* {{%.*}}, align 4
+; CHECK-NEXT: [[TRUNC1:%.*]] = trunc <16 x i32> [[WIDE_LOAD0]] to <16 x i8>
+; CHECK-NEXT: [[TRUNC2:%.*]] = trunc <16 x i32> [[WIDE_LOAD1]] to <16 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, i8* {{%.*}}, i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TRUNC1]], <16 x i8>* [[TMP17]], align 1
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* {{%.*}}, i32 16
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TRUNC2]], <16 x i8>* [[TMP19]], align 1
+entry:
+  %cmp4 = icmp sgt i32 %width, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %dst.addr.06 = phi i8* [ %incdec.ptr1, %for.body ], [ %dst, %for.body.preheader ]
+  %src.addr.05 = phi i32* [ %incdec.ptr, %for.body ], [ %src, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %src.addr.05, i64 1
+  %0 = load i32, i32* %src.addr.05, align 4
+  %conv = trunc i32 %0 to i8
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %dst.addr.06, i64 1
+  store i8 %conv, i8* %dst.addr.06, align 1
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %width
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -88,9 +88,9 @@
 }
 
 ; CHECK-LABEL: @add_c(
-; CHECK: load <8 x i8>, <8 x i8>*
-; CHECK: add <8 x i16>
-; CHECK: store <8 x i16>
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: add <16 x i16>
+; CHECK: store <16 x i16>
 ; Function Attrs: nounwind
 define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
 entry:
@@ -116,9 +116,9 @@
 }
 
 ; CHECK-LABEL: @add_d(
-; CHECK: load <4 x i16>
-; CHECK: add nsw <4 x i32>
-; CHECK: store <4 x i32>
+; CHECK: load <8 x i16>
+; CHECK: add nsw <8 x i32>
+; CHECK: store <8 x i32>
 define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
 entry:
   %cmp7 = icmp sgt i32 %len, 0
@@ -187,16 +187,16 @@
 }
 
 ; CHECK-LABEL: @add_f
-; CHECK: load <8 x i16>
-; CHECK: trunc <8 x i16>
-; CHECK: shl <8 x i8>
-; CHECK: add <8 x i8>
-; CHECK: or <8 x i8>
-; CHECK: mul <8 x i8>
-; CHECK: and <8 x i8>
-; CHECK: xor <8 x i8>
-; CHECK: mul <8 x i8>
-; CHECK: store <8 x i8>
+; CHECK: load <16 x i16>
+; CHECK: trunc <16 x i16>
+; CHECK: shl <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: or <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: and <16 x i8>
+; CHECK: xor <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: store <16 x i8>
 define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
 entry:
   %cmp.32 = icmp sgt i32 %len, 0
Index: test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -123,16 +123,16 @@
 ; }
 ;
 ; CHECK: vector.body:
-; CHECK: phi <8 x i16>
-; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
-; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: add <8 x i16>
+; CHECK: phi <16 x i16>
+; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
+; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
+; CHECK: add <16 x i16>
+; CHECK: add <16 x i16>
 ;
 ; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>
 ; CHECK: zext i16 [[Rdx]] to i32
 ;
 define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {