Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1185,7 +1185,7 @@
   /// to replace a call to llvm.memset. The value is set by the target at the
   /// performance threshold for such a replacement. If OptSize is true,
   /// return the limit for functions that have OptSize attribute.
-  unsigned getMaxStoresPerMemset(bool OptSize) const {
+  virtual unsigned getMaxStoresPerMemset(bool OptSize) const {
     return OptSize ? MaxStoresPerMemsetOptSize : MaxStoresPerMemset;
   }
 
@@ -1195,7 +1195,7 @@
   /// to replace a call to llvm.memcpy. The value is set by the target at the
   /// performance threshold for such a replacement. If OptSize is true,
   /// return the limit for functions that have OptSize attribute.
-  unsigned getMaxStoresPerMemcpy(bool OptSize) const {
+  virtual unsigned getMaxStoresPerMemcpy(bool OptSize) const {
     return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
   }
 
@@ -1214,7 +1214,7 @@
   /// to replace a call to memcmp. The value is set by the target at the
   /// performance threshold for such a replacement. If OptSize is true,
   /// return the limit for functions that have OptSize attribute.
-  unsigned getMaxExpandSizeMemcmp(bool OptSize) const {
+  virtual unsigned getMaxExpandSizeMemcmp(bool OptSize) const {
     return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp;
   }
 
@@ -1236,7 +1236,7 @@
   /// to replace a call to llvm.memmove. The value is set by the target at the
   /// performance threshold for such a replacement. If OptSize is true,
   /// return the limit for functions that have OptSize attribute.
-  unsigned getMaxStoresPerMemmove(bool OptSize) const {
+  virtual unsigned getMaxStoresPerMemmove(bool OptSize) const {
     return OptSize ? MaxStoresPerMemmoveOptSize : MaxStoresPerMemmove;
   }
 
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -238,6 +238,10 @@
 class AArch64TargetMachine;
 
 class AArch64TargetLowering : public TargetLowering {
+  /// Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const AArch64Subtarget *Subtarget;
+
 public:
   explicit AArch64TargetLowering(const TargetMachine &TM,
                                  const AArch64Subtarget &STI);
@@ -266,6 +270,10 @@
                                       unsigned Align = 1,
                                       bool *Fast = nullptr) const override;
 
+  // In case of strict alignment, avoid an excessive number of byte-wide stores.
+  unsigned getMaxStoresPerMemset(bool OptSize) const override;
+  unsigned getMaxStoresPerMemcpy(bool OptSize) const override;
+
   /// Provide custom lowering hooks for some operations.
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
@@ -500,10 +508,6 @@
 private:
   bool isExtFreeImpl(const Instruction *Ext) const override;
 
-  /// Keep a pointer to the AArch64Subtarget around so that we can
-  /// make the right decision when generating code for different targets.
-  const AArch64Subtarget *Subtarget;
-
   void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
   void addDRTypeForNEON(MVT VT);
   void addQRTypeForNEON(MVT VT);
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1065,6 +1065,19 @@
   return true;
 }
 
+// In case of strict alignment, avoid an excessive number of byte-wide stores
+// when expanding memset().
+unsigned AArch64TargetLowering::getMaxStoresPerMemset(bool OptSize) const {
+  return OptSize || Subtarget->requiresStrictAlign()
+             ? MaxStoresPerMemsetOptSize : MaxStoresPerMemset;
+}
+
+// Likewise for memcpy().
+unsigned AArch64TargetLowering::getMaxStoresPerMemcpy(bool OptSize) const {
+  return OptSize || Subtarget->requiresStrictAlign()
+             ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
+}
+
 FastISel *
 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                       const TargetLibraryInfo *libInfo) const {
Index: llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
+++ llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=arm64-apple-ios -mattr=+strict-align < %s | FileCheck %s
 
-; Small (16-bytes here) unaligned memcpys should stay memcpy calls if
+; Small (16 bytes here) unaligned memcpys should stay memcpy calls if
 ; strict-alignment is turned on.
 define void @t0(i8* %out, i8* %in) {
 ; CHECK-LABEL: t0:
@@ -11,4 +11,15 @@
   ret void
 }
 
+; Tiny (4 bytes here) unaligned memcpys should be inlined with byte-sized
+; loads and stores if strict-alignment is turned on.
+define void @t1(i8* %out, i8* %in) {
+; CHECK-LABEL: t1:
+; CHECK: ldrb w{{[0-9]+}}, [x1]
+; CHECK: strb w{{[0-9]+}}, [x0]
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 4, i1 false)
+  ret void
+}
+
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)
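For reference, below is the pattern the patch relies on, reduced to a standalone C++ sketch. This is illustrative only, not LLVM code: the class names LoweringBase and StrictAlignLowering, the shouldExpandInline() helper, and the budget values 8/4 are invented for the example; in tree the limit is consulted by the generic memcpy/memset lowering in SelectionDAG rather than by a helper like this.

    // Sketch: a virtual per-target store budget lets a subtarget that requires
    // strict alignment fall back to the tighter "OptSize" limit, so tiny copies
    // are still expanded inline while larger unaligned ones remain libcalls.
    #include <cstdio>

    struct LoweringBase {
      unsigned MaxStoresPerMemcpy = 8;        // invented default expansion budget
      unsigned MaxStoresPerMemcpyOptSize = 4; // invented budget when optimizing for size
      virtual ~LoweringBase() = default;
      virtual unsigned getMaxStoresPerMemcpy(bool OptSize) const {
        return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
      }
    };

    struct StrictAlignLowering : LoweringBase {
      bool RequiresStrictAlign = true; // stands in for Subtarget->requiresStrictAlign()
      unsigned getMaxStoresPerMemcpy(bool OptSize) const override {
        // Under strict alignment every store may end up byte wide, so use the
        // tighter OptSize budget to avoid an excessive expansion.
        return OptSize || RequiresStrictAlign ? MaxStoresPerMemcpyOptSize
                                              : MaxStoresPerMemcpy;
      }
    };

    // Caller side: expand inline only if the copy fits in the budget, assuming
    // the worst case of one byte-wide store per byte copied.
    static bool shouldExpandInline(const LoweringBase &TLI, unsigned Bytes,
                                   bool OptSize) {
      return Bytes <= TLI.getMaxStoresPerMemcpy(OptSize);
    }

    int main() {
      StrictAlignLowering TLI;
      std::printf("4-byte copy inlined:  %s\n",
                  shouldExpandInline(TLI, 4, false) ? "yes" : "no");
      std::printf("16-byte copy inlined: %s\n",
                  shouldExpandInline(TLI, 16, false) ? "yes" : "no");
      return 0;
    }

With the invented budgets above, the 4-byte copy stays within the limit and the 16-byte one does not, mirroring what the t1 and t0 test cases check for the real backend.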