diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -629,7 +629,12 @@
 AArch64TTIImpl::TTI::MemCmpExpansionOptions
 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   TTI::MemCmpExpansionOptions Options;
-  Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
+  if (ST->requiresStrictAlign()) {
+    // TODO: Add cost modeling for strict align. Misaligned loads expand into
+    // many instructions when strict alignment is enabled.
+    return Options;
+  }
+  Options.AllowOverlappingLoads = true;
   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
   Options.NumLoadsPerBlock = Options.MaxNumLoads;
   // TODO: Though vector loads usually perform well on AArch64, in some targets
diff --git a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
--- a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
@@ -11,12 +11,12 @@
   ret i1 %ret
 
 ; CHECK-LABEL: test_b2:
-; CHECK-NOT:   bl bcmp
+; CHECKN-NOT:  bl bcmp
 ; CHECKN:      ldr x
 ; CHECKN-NEXT: ldr x
 ; CHECKN-NEXT: ldur x
 ; CHECKN-NEXT: ldur x
-; CHECKS-COUNT-30: ldrb w
+; CHECKS:      bl bcmp
 }
 
 define i1 @test_b2_align8(i8* align 8 %s1, i8* align 8 %s2) {
@@ -26,19 +26,13 @@
   ret i1 %ret
 
 ; CHECK-LABEL: test_b2_align8:
-; CHECK-NOT:   bl bcmp
+; CHECKN-NOT:  bl bcmp
 ; CHECKN:      ldr x
 ; CHECKN-NEXT: ldr x
 ; CHECKN-NEXT: ldur x
 ; CHECKN-NEXT: ldur x
-; CHECKS:      ldr x
-; CHECKS-NEXT: ldr x
-; CHECKS-NEXT: ldr w
-; CHECKS-NEXT: ldr w
-; CHECKS-NEXT: ldrh w
-; CHECKS-NEXT: ldrh w
-; CHECKS-NEXT: ldrb w
-; CHECKS-NEXT: ldrb w
+; TODO: Four loads should be within the limit, but the heuristic isn't implemented.
+; CHECKS:      bl bcmp
 }
 
 define i1 @test_bs(i8* %s1, i8* %s2) optsize {
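
Note on the mechanism: TTI::MemCmpExpansionOptions converts to bool based on MaxNumLoads, so the early "return Options;" above hands back a default-constructed value (MaxNumLoads == 0) and the ExpandMemCmp pass keeps the bcmp/memcmp libcall instead of inlining loads. The standalone sketch below illustrates that shape only; the struct and the RequiresStrictAlign/MaxExpandSize parameters are simplified stand-ins for ST->requiresStrictAlign() and TLI->getMaxExpandSizeMemcmp(OptSize), not the actual LLVM declarations.

#include <cstdio>

// Simplified stand-in for TTI::MemCmpExpansionOptions (sketch only; the real
// struct lives in llvm/Analysis/TargetTransformInfo.h and has more fields).
struct MemCmpExpansionOptions {
  unsigned MaxNumLoads = 0;       // 0 means "do not expand"
  unsigned NumLoadsPerBlock = 1;
  bool AllowOverlappingLoads = false;
  // The expansion pass consults this to decide whether to expand at all.
  explicit operator bool() const { return MaxNumLoads > 0; }
};

// Sketch of the patched hook: under strict alignment, bail out before any
// field is set, so the returned value converts to false and the libcall stays.
MemCmpExpansionOptions enableMemCmpExpansion(bool RequiresStrictAlign,
                                             unsigned MaxExpandSize) {
  MemCmpExpansionOptions Options;
  if (RequiresStrictAlign)
    return Options;               // MaxNumLoads is still 0: expansion disabled
  Options.AllowOverlappingLoads = true;
  Options.MaxNumLoads = MaxExpandSize;
  Options.NumLoadsPerBlock = Options.MaxNumLoads;
  return Options;
}

int main() {
  MemCmpExpansionOptions Relaxed = enableMemCmpExpansion(false, 8);
  MemCmpExpansionOptions Strict = enableMemCmpExpansion(true, 8);
  std::printf("relaxed-align: expand=%d, strict-align: expand=%d\n",
              static_cast<bool>(Relaxed), static_cast<bool>(Strict));
  return 0;
}

This is also why the CHECKS (strict-align) test lines now expect "bl bcmp": with expansion disabled, the call survives instruction selection, whereas the CHECKN (normal) lines still expect the overlapping ldr/ldur pairs.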