Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
@@ -498,12 +498,12 @@
                       CallingConv::ID CallConv, bool isVarArg) const override;
 
 private:
-  bool isExtFreeImpl(const Instruction *Ext) const override;
-
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
   const AArch64Subtarget *Subtarget;
 
+  bool isExtFreeImpl(const Instruction *Ext) const override;
+
   void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
   void addDRTypeForNEON(MVT VT);
   void addQRTypeForNEON(MVT VT);
Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -579,11 +579,17 @@
 
   setTargetDAGCombine(ISD::GlobalAddress);
 
-  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
+  // In case of strict alignment, avoid an excessive number of byte wide stores.
+  MaxStoresPerMemsetOptSize = 8;
+  MaxStoresPerMemset = Subtarget->requiresStrictAlign()
+                       ? MaxStoresPerMemsetOptSize : 32;
+
+  MaxGluedStoresPerMemcpy = 4;
+  MaxStoresPerMemcpyOptSize = 4;
+  MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
+                       ? MaxStoresPerMemcpyOptSize : 16;
 
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
-
-  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+  MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
 
   setStackPointerRegisterToSaveRestore(AArch64::SP);
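What the AArch64ISelLowering.cpp hunk does, restated outside diff form: when
the subtarget requires strict alignment, inline expansion of an unaligned
memset/memcpy can only use byte-wide accesses, so the store-count limits are
kept at their conservative OptSize values; otherwise the memset limit rises to
32 and the memcpy limit to 16. A minimal sketch of that policy in plain C++
(MemOpLimits and selectStoreLimits are illustrative names, not LLVM APIs; the
real knobs are the TargetLoweringBase members named in the hunk):

    // Illustrative restatement of the hunk's policy; not LLVM code.
    struct MemOpLimits {
      unsigned MemsetOptSize = 8;  // MaxStoresPerMemsetOptSize
      unsigned Memset = 32;        // MaxStoresPerMemset, unaligned access legal
      unsigned MemcpyOptSize = 4;  // MaxStoresPerMemcpyOptSize
      unsigned Memcpy = 16;        // MaxStoresPerMemcpy, unaligned access legal
    };

    MemOpLimits selectStoreLimits(bool RequiresStrictAlign) {
      MemOpLimits Limits;
      if (RequiresStrictAlign) {
        // Expansion may degrade to byte-wide stores, so keep the
        // conservative OptSize limits even at full optimization.
        Limits.Memset = Limits.MemsetOptSize;
        Limits.Memcpy = Limits.MemcpyOptSize;
      }
      return Limits;
    }

Note that the memmove limits stay at 4 in both modes, so only the memset and
memcpy behavior changes; the test updates below follow from exactly that.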
Index: llvm/trunk/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
@@ -1,14 +1,14 @@
 ; RUN: llc %s -mtriple=arm64-apple-darwin -o - | \
-; RUN:   FileCheck --check-prefix=CHECK-DARWIN --check-prefix=CHECK %s
+; RUN:   FileCheck --check-prefixes=CHECK,CHECK-DARWIN %s
 ; RUN: llc %s -mtriple=arm64-linux-gnu -o - | \
-; RUN:   FileCheck --check-prefix=CHECK-LINUX --check-prefix=CHECK %s
+; RUN:   FileCheck --check-prefixes=CHECK,CHECK-LINUX %s
 ; ARM64: Calls to bzero() replaced with calls to memset()
 
 ; CHECK-LABEL: fct1:
 ; For small size (<= 256), we do not change memset to bzero.
 ; CHECK-DARWIN: {{b|bl}} _memset
 ; CHECK-LINUX: {{b|bl}} memset
-define void @fct1(i8* nocapture %ptr) {
+define void @fct1(i8* nocapture %ptr) minsize {
 entry:
   tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i1 false)
   ret void
@@ -20,7 +20,7 @@
 ; When the size is bigger than 256, change into bzero.
 ; CHECK-DARWIN: {{b|bl}} _bzero
 ; CHECK-LINUX: {{b|bl}} memset
-define void @fct2(i8* nocapture %ptr) {
+define void @fct2(i8* nocapture %ptr) minsize {
 entry:
   tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i1 false)
   ret void
@@ -30,7 +30,7 @@
 ; For unknown size, change to bzero.
 ; CHECK-DARWIN: {{b|bl}} _bzero
 ; CHECK-LINUX: {{b|bl}} memset
-define void @fct3(i8* nocapture %ptr, i32 %unknown) {
+define void @fct3(i8* nocapture %ptr, i32 %unknown) minsize {
 entry:
   %conv = sext i32 %unknown to i64
   tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i1 false)
@@ -41,7 +41,7 @@
 ; Size <= 256, no change.
 ; CHECK-DARWIN: {{b|bl}} _memset
 ; CHECK-LINUX: {{b|bl}} memset
-define void @fct4(i8* %ptr) {
+define void @fct4(i8* %ptr) minsize {
 entry:
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
   %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp)
@@ -56,7 +56,7 @@
 ; Size > 256, change.
 ; CHECK-DARWIN: {{b|bl}} _bzero
 ; CHECK-LINUX: {{b|bl}} memset
-define void @fct5(i8* %ptr) {
+define void @fct5(i8* %ptr) minsize {
 entry:
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
   %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp)
@@ -67,7 +67,7 @@
 ; Size = unknown, change.
 ; CHECK-DARWIN: {{b|bl}} _bzero
 ; CHECK-LINUX: {{b|bl}} memset
-define void @fct6(i8* %ptr, i32 %unknown) {
+define void @fct6(i8* %ptr, i32 %unknown) minsize {
 entry:
   %conv = sext i32 %unknown to i64
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -82,7 +82,7 @@
 ; memset with something that is not a zero, no change.
 ; CHECK-DARWIN: {{b|bl}} _memset
 ; CHECK-LINUX: {{b|bl}} memset
-define void @fct7(i8* %ptr) {
+define void @fct7(i8* %ptr) minsize {
 entry:
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
   %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp)
@@ -93,7 +93,7 @@
 ; memset with something that is not a zero, no change.
 ; CHECK-DARWIN: {{b|bl}} _memset
 ; CHECK-LINUX: {{b|bl}} memset
-define void @fct8(i8* %ptr) {
+define void @fct8(i8* %ptr) minsize {
 entry:
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
   %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp)
@@ -104,7 +104,7 @@
 ; memset with something that is not a zero, no change.
 ; CHECK-DARWIN: {{b|bl}} _memset
 ; CHECK-LINUX: {{b|bl}} memset
-define void @fct9(i8* %ptr, i32 %unknown) {
+define void @fct9(i8* %ptr, i32 %unknown) minsize {
 entry:
   %conv = sext i32 %unknown to i64
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
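Why the nine bzero tests gain minsize: with MaxStoresPerMemset raised to 32 on
targets without strict alignment, a 256-byte zeroing memset would now be
expanded inline at -O2 and the memset-to-bzero substitution these tests look
for would never fire. Marking the functions minsize keeps them under the
unchanged OptSize limit of 8, so the libcall path stays exercised. The
arithmetic, as a check one can compile (the 16-byte store width is an
assumption about the expansion, not something stated in the patch):

    // Mirrors the MaxStoresPerMemset(OptSize) check from the hunk above;
    // memsetExpandsInline is a hypothetical helper, not an LLVM API.
    bool memsetExpandsInline(unsigned Bytes, bool OptForSize) {
      unsigned Limit = OptForSize ? 8u : 32u;
      unsigned Stores = (Bytes + 15) / 16;  // assumed widest zeroing store
      return Stores <= Limit;
    }
    // memsetExpandsInline(256, false) -> true:  no memset call left to rewrite
    // memsetExpandsInline(256, true)  -> false: the call survives, so the
    //                                    bzero rewrite remains observable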
Index: llvm/trunk/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
@@ -1,14 +1,42 @@
 ; RUN: llc -mtriple=arm64-apple-ios -mattr=+strict-align < %s | FileCheck %s
 
-; Small (16-bytes here) unaligned memcpys should stay memcpy calls if
+; Small (16 bytes here) unaligned memcpy() should be a function call if
 ; strict-alignment is turned on.
 define void @t0(i8* %out, i8* %in) {
 ; CHECK-LABEL: t0:
-; CHECK: orr w2, wzr, #0x10
-; CHECK-NEXT: bl _memcpy
+; CHECK:       orr w2, wzr, #0x10
+; CHECK-NEXT:  bl _memcpy
 entry:
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 16, i1 false)
   ret void
 }
 
+; Small (16 bytes here) aligned memcpy() should be inlined even if
+; strict-alignment is turned on.
+define void @t1(i8* align 8 %out, i8* align 8 %in) {
+; CHECK-LABEL: t1:
+; CHECK:       ldp x{{[0-9]+}}, x{{[0-9]+}}, [x1]
+; CHECK-NEXT:  stp x{{[0-9]+}}, x{{[0-9]+}}, [x0]
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %out, i8* align 8 %in, i64 16, i1 false)
+  ret void
+}
+
+; Tiny (4 bytes here) unaligned memcpy() should be inlined with byte sized
+; loads and stores if strict-alignment is turned on.
+define void @t2(i8* %out, i8* %in) {
+; CHECK-LABEL: t2:
+; CHECK:       ldrb w{{[0-9]+}}, [x1, #3]
+; CHECK-NEXT:  ldrb w{{[0-9]+}}, [x1, #2]
+; CHECK-NEXT:  ldrb w{{[0-9]+}}, [x1, #1]
+; CHECK-NEXT:  ldrb w{{[0-9]+}}, [x1]
+; CHECK-NEXT:  strb w{{[0-9]+}}, [x0, #3]
+; CHECK-NEXT:  strb w{{[0-9]+}}, [x0, #2]
+; CHECK-NEXT:  strb w{{[0-9]+}}, [x0, #1]
+; CHECK-NEXT:  strb w{{[0-9]+}}, [x0]
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 4, i1 false)
+  ret void
+}
+
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)
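The three tests above pin down the strict-align memcpy matrix: unaligned
16 bytes stays a libcall (t0), 8-byte-aligned 16 bytes is inlined as an
ldp/stp pair (t1), and unaligned 4 bytes is inlined as byte loads and stores
(t2). A compact sketch of that decision under the limits set in
AArch64ISelLowering.cpp (decideMemcpy and the enum are illustrations derived
from the tests, not LLVM APIs):

    enum class MemcpyLowering { Libcall, WideInline, ByteInline };

    // Sketch: with strict alignment, access width is capped by the known
    // pointer alignment, and the store budget is MaxStoresPerMemcpy (= 4).
    MemcpyLowering decideMemcpy(unsigned Bytes, unsigned KnownAlign) {
      unsigned Width = KnownAlign ? KnownAlign : 1;
      unsigned Stores = (Bytes + Width - 1) / Width;
      if (Stores > 4)
        return MemcpyLowering::Libcall;               // t0: 16 B, align 1
      return Width >= 8 ? MemcpyLowering::WideInline  // t1: 16 B, align 8
                        : MemcpyLowering::ByteInline; // t2:  4 B, align 1
    }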