Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8342,27 +8342,30 @@
                                                bool ZeroMemset,
                                                bool MemcpyStrSrc,
                                                MachineFunction &MF) const {
-  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
-  // instruction to materialize the v2i64 zero and one store (with restrictive
-  // addressing mode). Just do two i64 store of zero-registers.
-  bool Fast;
   const Function &F = MF.getFunction();
-  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
-      !F.hasFnAttribute(Attribute::NoImplicitFloat) &&
-      (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
-    return MVT::f128;
+  bool CanImplicitFloat = !F.hasFnAttribute(Attribute::NoImplicitFloat);
+  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
+  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
+  // Only use AdvSIMD to implement memsets of 32 bytes and above. It would
+  // have taken one instruction to materialize the v2i64 zero and one store
+  // (with restrictive addressing mode). Just do i64 stores.
+  bool IsSmallMemset = IsMemset && Size < 32;
+  auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
+    if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
+      return true;
+    bool Fast;
+    return allowsMisalignedMemoryAccesses(VT, 0, 1, &Fast) && Fast;
+  };
 
-  if (Size >= 8 &&
-      (memOpAlign(SrcAlign, DstAlign, 8) ||
-       (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast)))
+  if (CanUseNEON && IsMemset && !IsSmallMemset &&
+      AlignmentIsAcceptable(MVT::v2i64, 16))
+    return MVT::v2i64;
+  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
+    return MVT::f128;
+  if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
     return MVT::i64;
-
-  if (Size >= 4 &&
-      (memOpAlign(SrcAlign, DstAlign, 4) ||
-       (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast)))
+  if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
     return MVT::i32;
-
   return MVT::Other;
 }
Index: llvm/trunk/test/CodeGen/AArch64/arm64-memset-inline.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -137,14 +137,12 @@
   ret void
 }
 
-; FIXME These don't pair up because the offset isn't a multiple of 16 bits. x0, however, could be used as a base for a paired store.
 define void @bzero_40_stack() {
 ; CHECK-LABEL: bzero_40_stack:
-; CHECK: stp xzr, x30, [sp, #40]
-; CHECK: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x0, sp, #8
-; CHECK-NEXT: stur q0, [sp, #24]
-; CHECK-NEXT: stur q0, [sp, #8]
+; CHECK: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: str xzr, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [40 x i8], align 1
   %cast = bitcast [40 x i8]* %buf to i8*
@@ -167,16 +165,13 @@
   ret void
 }
 
-; FIXME These don't pair up because the offset isn't a multiple of 16 bits. x0, however, could be used as a base for a paired store.
 define void @bzero_72_stack() {
 ; CHECK-LABEL: bzero_72_stack:
-; CHECK: stp xzr, x30, [sp, #72]
 ; CHECK: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x0, sp, #8
-; CHECK-NEXT: stur q0, [sp, #56]
-; CHECK-NEXT: stur q0, [sp, #40]
-; CHECK-NEXT: stur q0, [sp, #24]
-; CHECK-NEXT: stur q0, [sp, #8]
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: str xzr, [sp, #64]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [72 x i8], align 1
   %cast = bitcast [72 x i8]* %buf to i8*
@@ -310,14 +305,11 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_32_stack() {
 ; CHECK-LABEL: memset_32_stack:
-; CHECK: mov x8, #-6148914691236517206
+; CHECK: movi v0.16b, #170
 ; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp x8, x30, [sp, #24]
-; CHECK-NEXT: stp x8, x8, [sp, #8]
-; CHECK-NEXT: str x8, [sp]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [32 x i8], align 1
   %cast = bitcast [32 x i8]* %buf to i8*
@@ -326,14 +318,13 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_40_stack() {
 ; CHECK-LABEL: memset_40_stack:
 ; CHECK: mov x8, #-6148914691236517206
-; CHECK-NEXT: add x0, sp, #8
-; CHECK-NEXT: stp x8, x30, [sp, #40]
-; CHECK-NEXT: stp x8, x8, [sp, #24]
-; CHECK-NEXT: stp x8, x8, [sp, #8]
+; CHECK-NEXT: movi v0.16b, #170
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: str x8, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [40 x i8], align 1
   %cast = bitcast [40 x i8]* %buf to i8*
@@ -342,16 +333,12 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_64_stack() {
 ; CHECK-LABEL: memset_64_stack:
-; CHECK: mov x8, #-6148914691236517206
+; CHECK: movi v0.16b, #170
 ; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp x8, x30, [sp, #56]
-; CHECK-NEXT: stp x8, x8, [sp, #40]
-; CHECK-NEXT: stp x8, x8, [sp, #24]
-; CHECK-NEXT: stp x8, x8, [sp, #8]
-; CHECK-NEXT: str x8, [sp]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [64 x i8], align 1
   %cast = bitcast [64 x i8]* %buf to i8*
@@ -360,16 +347,14 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_72_stack() {
 ; CHECK-LABEL: memset_72_stack:
 ; CHECK: mov x8, #-6148914691236517206
-; CHECK-NEXT: add x0, sp, #8
-; CHECK-NEXT: stp x8, x30, [sp, #72]
-; CHECK-NEXT: stp x8, x8, [sp, #56]
-; CHECK-NEXT: stp x8, x8, [sp, #40]
-; CHECK-NEXT: stp x8, x8, [sp, #24]
-; CHECK-NEXT: stp x8, x8, [sp, #8]
+; CHECK-NEXT: movi v0.16b, #170
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: str x8, [sp, #64]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [72 x i8], align 1
   %cast = bitcast [72 x i8]* %buf to i8*
@@ -378,20 +363,14 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_128_stack() {
 ; CHECK-LABEL: memset_128_stack:
-; CHECK: mov x8, #-6148914691236517206
+; CHECK: movi v0.16b, #170
 ; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp x8, x30, [sp, #120]
-; CHECK-NEXT: stp x8, x8, [sp, #104]
-; CHECK-NEXT: stp x8, x8, [sp, #88]
-; CHECK-NEXT: stp x8, x8, [sp, #72]
-; CHECK-NEXT: stp x8, x8, [sp, #56]
-; CHECK-NEXT: stp x8, x8, [sp, #40]
-; CHECK-NEXT: stp x8, x8, [sp, #24]
-; CHECK-NEXT: stp x8, x8, [sp, #8]
-; CHECK-NEXT: str x8, [sp]
+; CHECK-NEXT: stp q0, q0, [sp, #96]
+; CHECK-NEXT: stp q0, q0, [sp, #64]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [128 x i8], align 1
   %cast = bitcast [128 x i8]* %buf to i8*
@@ -400,27 +379,18 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_256_stack() {
 ; CHECK-LABEL: memset_256_stack:
-; CHECK: mov x8, #-6148914691236517206
-; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp x8, x8, [sp, #240]
-; CHECK-NEXT: stp x8, x8, [sp, #224]
-; CHECK-NEXT: stp x8, x8, [sp, #208]
-; CHECK-NEXT: stp x8, x8, [sp, #192]
-; CHECK-NEXT: stp x8, x8, [sp, #176]
-; CHECK-NEXT: stp x8, x8, [sp, #160]
-; CHECK-NEXT: stp x8, x8, [sp, #144]
-; CHECK-NEXT: stp x8, x8, [sp, #128]
-; CHECK-NEXT: stp x8, x8, [sp, #112]
-; CHECK-NEXT: stp x8, x8, [sp, #96]
-; CHECK-NEXT: stp x8, x8, [sp, #80]
-; CHECK-NEXT: stp x8, x8, [sp, #64]
-; CHECK-NEXT: stp x8, x8, [sp, #48]
-; CHECK-NEXT: stp x8, x8, [sp, #32]
-; CHECK-NEXT: stp x8, x8, [sp, #16]
-; CHECK-NEXT: stp x8, x8, [sp]
+; CHECK: movi v0.16b, #170
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: stp q0, q0, [sp, #224]
+; CHECK-NEXT: stp q0, q0, [sp, #192]
+; CHECK-NEXT: stp q0, q0, [sp, #160]
+; CHECK-NEXT: stp q0, q0, [sp, #128]
+; CHECK-NEXT: stp q0, q0, [sp, #96]
+; CHECK-NEXT: stp q0, q0, [sp, #64]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [256 x i8], align 1
   %cast = bitcast [256 x i8]* %buf to i8*
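For reference, the 32-byte cutover the new getOptimalMemOpType logic introduces can be seen on a pair of minimal IR functions. This is a hedged sketch, not part of the committed test: the function names @zero16/@zero32 are hypothetical, and the expected instructions in the comments are an assumption based on the code above, where IsSmallMemset suppresses the NEON and FP paths for memsets under 32 bytes.

; A 16-byte zeroing memset is below the 32-byte cutoff, so it should stay
; on integer stores of the zero register rather than materializing a vector.
define void @zero16(i8* %p) {
  call void @llvm.memset.p0i8.i64(i8* align 16 %p, i8 0, i64 16, i1 false)
  ; Expected (assumption): stp xzr, xzr, [x0]
  ret void
}

; A 32-byte memset reaches the CanUseNEON path, which returns MVT::v2i64,
; so one movi plus a paired q-register store should cover the whole buffer.
define void @zero32(i8* %p) {
  call void @llvm.memset.p0i8.i64(i8* align 16 %p, i8 0, i64 32, i1 false)
  ; Expected (assumption): movi v0.2d, #0000000000000000
  ;                        stp q0, q0, [x0]
  ret void
}

declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)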