Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5195,40 +5195,29 @@
   // FIXME: We should be able to handle f128 as well with a clever lowering.
   if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32 ||
                           (VT == MVT::f16 && Subtarget->hasFullFP16()))) {
-    LLVM_DEBUG(
-        dbgs() << "Legal fp imm: materialize 0 using the zero register\n");
+    LLVM_DEBUG(dbgs() << "Legal " << VT.getEVTString() << " imm value: 0\n");
     return true;
   }
 
-  StringRef FPType;
   bool IsLegal = false;
   SmallString<128> ImmStrVal;
   Imm.toString(ImmStrVal);
 
-  if (VT == MVT::f64) {
-    FPType = "f64";
+  if (VT == MVT::f64)
     IsLegal = AArch64_AM::getFP64Imm(Imm) != -1;
-  } else if (VT == MVT::f32) {
-    FPType = "f32";
+  else if (VT == MVT::f32)
     IsLegal = AArch64_AM::getFP32Imm(Imm) != -1;
-  } else if (VT == MVT::f16 && Subtarget->hasFullFP16()) {
-    FPType = "f16";
+  else if (VT == MVT::f16 && Subtarget->hasFullFP16())
     IsLegal = AArch64_AM::getFP16Imm(Imm) != -1;
-  }
 
   if (IsLegal) {
-    LLVM_DEBUG(dbgs() << "Legal " << FPType << " imm value: " << ImmStrVal
-                      << "\n");
+    LLVM_DEBUG(dbgs() << "Legal " << VT.getEVTString()
+                      << " imm value: " << ImmStrVal << "\n");
     return true;
   }
 
-  if (!FPType.empty())
-    LLVM_DEBUG(dbgs() << "Illegal " << FPType << " imm value: " << ImmStrVal
-                      << "\n");
-  else
-    LLVM_DEBUG(dbgs() << "Illegal fp imm " << ImmStrVal
-                      << ": unsupported fp type\n");
-
+  LLVM_DEBUG(dbgs() << "Illegal " << VT.getEVTString()
+                    << " imm value: " << ImmStrVal << "\n");
   return false;
 }
 
@@ -8353,27 +8342,29 @@
                                                bool ZeroMemset,
                                                bool MemcpyStrSrc,
                                                MachineFunction &MF) const {
-  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
-  // instruction to materialize the v2i64 zero and one store (with restrictive
-  // addressing mode). Just do two i64 store of zero-registers.
-  bool Fast;
   const Function &F = MF.getFunction();
-  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
-      !F.hasFnAttribute(Attribute::NoImplicitFloat) &&
-      (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
-    return MVT::f128;
+  bool CanUseFP =
+      Subtarget->hasFPARMv8() && !F.hasFnAttribute(Attribute::NoImplicitFloat);
+  // Only use AdvSIMD to implement memsets of 32 bytes and above. It would have
+  // taken one instruction to materialize the v2i64 zero and one store (with a
+  // restrictive addressing mode). Just do i64 stores.
+  bool IsSmallMemset = IsMemset && Size < 32;
+  auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
+    if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
+      return true;
+    bool Fast;
+    return allowsMisalignedMemoryAccesses(VT, 0, 1, &Fast) && Fast;
+  };
 
-  if (Size >= 8 &&
-      (memOpAlign(SrcAlign, DstAlign, 8) ||
-       (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast)))
+  if (CanUseFP && IsMemset && !IsSmallMemset &&
+      AlignmentIsAcceptable(MVT::v2i64, 16))
+    return MVT::v2i64;
+  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
+    return MVT::f128;
+  if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
     return MVT::i64;
-
-  if (Size >= 4 &&
-      (memOpAlign(SrcAlign, DstAlign, 4) ||
-       (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast)))
+  if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
     return MVT::i32;
-
   return MVT::Other;
 }
 
Index: test/CodeGen/AArch64/arm64-memset-inline.ll
===================================================================
--- test/CodeGen/AArch64/arm64-memset-inline.ll
+++ test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -137,14 +137,12 @@
   ret void
 }
 
-; FIXME These don't pair up because the offset isn't a multiple of 16 bits. x0, however, could be used as a base for a paired store.
 define void @bzero_40_stack() {
 ; CHECK-LABEL: bzero_40_stack:
-; CHECK: stp xzr, x30, [sp, #40]
-; CHECK: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x0, sp, #8
-; CHECK-NEXT: stur q0, [sp, #24]
-; CHECK-NEXT: stur q0, [sp, #8]
+; CHECK: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: str xzr, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [40 x i8], align 1
   %cast = bitcast [40 x i8]* %buf to i8*
@@ -167,16 +165,13 @@
   ret void
 }
 
-; FIXME These don't pair up because the offset isn't a multiple of 16 bits. x0, however, could be used as a base for a paired store.
 define void @bzero_72_stack() {
 ; CHECK-LABEL: bzero_72_stack:
-; CHECK: stp xzr, x30, [sp, #72]
 ; CHECK: movi v0.2d, #0000000000000000
-; CHECK-NEXT: x0, sp, #8
-; CHECK-NEXT: stur q0, [sp, #56]
-; CHECK-NEXT: stur q0, [sp, #40]
-; CHECK-NEXT: stur q0, [sp, #24]
-; CHECK-NEXT: stur q0, [sp, #8]
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: str xzr, [sp, #64]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [72 x i8], align 1
   %cast = bitcast [72 x i8]* %buf to i8*
@@ -310,14 +305,11 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_32_stack() {
 ; CHECK-LABEL: memset_32_stack:
-; CHECK: mov x8, #-6148914691236517206
+; CHECK: movi v0.16b, #170
 ; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp x8, x30, [sp, #24]
-; CHECK-NEXT: stp x8, x8, [sp, #8]
-; CHECK-NEXT: str x8, [sp]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [32 x i8], align 1
   %cast = bitcast [32 x i8]* %buf to i8*
@@ -326,14 +318,13 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_40_stack() {
 ; CHECK-LABEL: memset_40_stack:
 ; CHECK: mov x8, #-6148914691236517206
-; CHECK-NEXT: add x0, sp, #8
-; CHECK-NEXT: stp x8, x30, [sp, #40]
-; CHECK-NEXT: stp x8, x8, [sp, #24]
-; CHECK-NEXT: stp x8, x8, [sp, #8]
+; CHECK-NEXT: movi v0.16b, #170
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: str x8, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [40 x i8], align 1
   %cast = bitcast [40 x i8]* %buf to i8*
@@ -342,16 +333,12 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_64_stack() {
 ; CHECK-LABEL: memset_64_stack:
-; CHECK: mov x8, #-6148914691236517206
+; CHECK: movi v0.16b, #170
 ; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp x8, x30, [sp, #56]
-; CHECK-NEXT: stp x8, x8, [sp, #40]
-; CHECK-NEXT: stp x8, x8, [sp, #24]
-; CHECK-NEXT: stp x8, x8, [sp, #8]
-; CHECK-NEXT: str x8, [sp]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [64 x i8], align 1
   %cast = bitcast [64 x i8]* %buf to i8*
@@ -360,16 +347,14 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_72_stack() {
 ; CHECK-LABEL: memset_72_stack:
 ; CHECK: mov x8, #-6148914691236517206
-; CHECK-NEXT: add x0, sp, #8
-; CHECK-NEXT: stp x8, x30, [sp, #72]
-; CHECK-NEXT: stp x8, x8, [sp, #56]
-; CHECK-NEXT: stp x8, x8, [sp, #40]
-; CHECK-NEXT: stp x8, x8, [sp, #24]
-; CHECK-NEXT: stp x8, x8, [sp, #8]
+; CHECK-NEXT: movi v0.16b, #170
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: str x8, [sp, #64]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [72 x i8], align 1
   %cast = bitcast [72 x i8]* %buf to i8*
@@ -378,20 +363,14 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_128_stack() {
 ; CHECK-LABEL: memset_128_stack:
-; CHECK: mov x8, #-6148914691236517206
+; CHECK: movi v0.16b, #170
 ; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp x8, x30, [sp, #120]
-; CHECK-NEXT: stp x8, x8, [sp, #104]
-; CHECK-NEXT: stp x8, x8, [sp, #88]
-; CHECK-NEXT: stp x8, x8, [sp, #72]
-; CHECK-NEXT: stp x8, x8, [sp, #56]
-; CHECK-NEXT: stp x8, x8, [sp, #40]
-; CHECK-NEXT: stp x8, x8, [sp, #24]
-; CHECK-NEXT: stp x8, x8, [sp, #8]
-; CHECK-NEXT: str x8, [sp]
+; CHECK-NEXT: stp q0, q0, [sp, #96]
+; CHECK-NEXT: stp q0, q0, [sp, #64]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [128 x i8], align 1
   %cast = bitcast [128 x i8]* %buf to i8*
@@ -400,27 +379,18 @@
   ret void
 }
 
-; FIXME This could use FP ops.
 define void @memset_256_stack() {
 ; CHECK-LABEL: memset_256_stack:
-; CHECK: mov x8, #-6148914691236517206
-; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp x8, x8, [sp, #240]
-; CHECK-NEXT: stp x8, x8, [sp, #224]
-; CHECK-NEXT: stp x8, x8, [sp, #208]
-; CHECK-NEXT: stp x8, x8, [sp, #192]
-; CHECK-NEXT: stp x8, x8, [sp, #176]
-; CHECK-NEXT: stp x8, x8, [sp, #160]
-; CHECK-NEXT: stp x8, x8, [sp, #144]
-; CHECK-NEXT: stp x8, x8, [sp, #128]
-; CHECK-NEXT: stp x8, x8, [sp, #112]
-; CHECK-NEXT: stp x8, x8, [sp, #96]
-; CHECK-NEXT: stp x8, x8, [sp, #80]
-; CHECK-NEXT: stp x8, x8, [sp, #64]
-; CHECK-NEXT: stp x8, x8, [sp, #48]
-; CHECK-NEXT: stp x8, x8, [sp, #32]
-; CHECK-NEXT: stp x8, x8, [sp, #16]
-; CHECK-NEXT: stp x8, x8, [sp]
+; CHECK: movi v0.16b, #170
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: stp q0, q0, [sp, #224]
+; CHECK-NEXT: stp q0, q0, [sp, #192]
+; CHECK-NEXT: stp q0, q0, [sp, #160]
+; CHECK-NEXT: stp q0, q0, [sp, #128]
+; CHECK-NEXT: stp q0, q0, [sp, #96]
+; CHECK-NEXT: stp q0, q0, [sp, #64]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [256 x i8], align 1
   %cast = bitcast [256 x i8]* %buf to i8*
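Note for reviewers: a minimal sketch of how the new getOptimalMemOpType() heuristic fires for a non-stack destination (this snippet is illustrative and not part of the patch; the function name, RUN line, and exact schedule are assumptions). A 32-byte memset with a 16-byte-aligned pointer is no longer "small", so the v2i64 path is taken: the 0xAA byte pattern is materialized once with movi and stored with a single paired q-register store, by analogy with the updated memset_32_stack checks above.

; RUN: llc -mtriple=arm64-eabi %s -o - | FileCheck %s
define void @memset_32_aligned(i8* %p) {
; CHECK-LABEL: memset_32_aligned:
; CHECK: movi v0.16b, #170
; CHECK: stp q0, q0, [x0]
  call void @llvm.memset.p0i8.i64(i8* align 16 %p, i8 -86, i64 32, i1 false)
  ret void
}
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1)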