Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -85,6 +85,7 @@
   bool HasArithmeticCbzFusion = false;
   bool DisableLatencySchedHeuristic = false;
   bool UseRSqrt = false;
+  bool UseUnalignedQuadwordStorePenalty = true;
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 3;
   uint16_t CacheLineSize = 0;
@@ -196,6 +197,9 @@
   bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
   bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
   bool useRSqrt() const { return UseRSqrt; }
+  bool useUnalignedQuadwordStorePenalty() const {
+    return UseUnalignedQuadwordStorePenalty;
+  }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const {
     return VectorInsertExtractBaseCost;
Index: lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.cpp
+++ lib/Target/AArch64/AArch64Subtarget.cpp
@@ -71,6 +71,7 @@
     break;
   case Falkor:
     MaxInterleaveFactor = 4;
+    UseUnalignedQuadwordStorePenalty = false;
     break;
   case Kryo:
     MaxInterleaveFactor = 4;
@@ -79,6 +80,7 @@
     PrefetchDistance = 740;
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 11;
+    UseUnalignedQuadwordStorePenalty = false;
     break;
   case Vulcan:
     MaxInterleaveFactor = 4;
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -467,7 +467,8 @@
                                     unsigned Alignment, unsigned AddressSpace) {
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
 
-  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
+  if (ST->useUnalignedQuadwordStorePenalty() && Opcode == Instruction::Store &&
+      Src->isVectorTy() && Alignment != 16 &&
       Src->getVectorElementType()->isIntegerTy(64)) {
     // Unaligned stores are extremely inefficient. We don't split
     // unaligned v2i64 stores because the negative impact that has shown in
Index: test/Analysis/CostModel/AArch64/falkor.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AArch64/falkor.ll
@@ -0,0 +1,15 @@
+; RUN: opt < %s -cost-model -analyze -mcpu=falkor | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: memoryOpCost
+define void @memoryOpCost(<2 x i64> %a, <2 x i64>* %ptr) {
+
+  ; Disable the unaligned quadword store penalty for falkor.
+  ;
+  ; CHECK: cost of 1 {{.*}} store <2 x i64> %a, <2 x i64>* %ptr, align 8
+  store <2 x i64> %a, <2 x i64>* %ptr, align 8
+
+  ret void
+}
Index: test/Analysis/CostModel/AArch64/kryo.ll
===================================================================
--- test/Analysis/CostModel/AArch64/kryo.ll
+++ test/Analysis/CostModel/AArch64/kryo.ll
@@ -24,3 +24,14 @@
 
   ret void
 }
+
+; CHECK-LABEL: memoryOpCost
+define void @memoryOpCost(<2 x i64> %a, <2 x i64>* %ptr) {
+
+  ; Disable the unaligned quadword store penalty for kryo.
+  ;
+  ; CHECK: cost of 1 {{.*}} store <2 x i64> %a, <2 x i64>* %ptr, align 8
+  store <2 x i64> %a, <2 x i64>* %ptr, align 8
+
+  ret void
+}
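Note (reviewer sketch, not part of the patch): the hunk in AArch64TargetTransformInfo.cpp only shows the start of the penalized branch in AArch64TTIImpl::getMemoryOpCost. The standalone C++ model below illustrates what the new subtarget hook gates, assuming the amortization factor of 6 and the LT.first * 2 * AmortizationCost formula used by the code surrounding that hunk; the function and variable names in the sketch are invented for illustration and do not exist in the tree.

// Simplified, self-contained model of the <2 x i64> store cost decision.
// `penaltyEnabled` stands in for ST->useUnalignedQuadwordStorePenalty(),
// `legalizationCost` for LT.first; the formula and the value 6 are
// assumptions based on the code around the hunk above.
#include <iostream>

int v2i64StoreCost(bool penaltyEnabled, unsigned alignment,
                   int legalizationCost = 1) {
  if (penaltyEnabled && alignment != 16) {
    const int AmortizationCost = 6;
    return legalizationCost * 2 * AmortizationCost; // heavily penalized
  }
  return legalizationCost; // generic cost of a legal vector store
}

int main() {
  // Generic AArch64: the unaligned quadword store penalty applies.
  std::cout << v2i64StoreCost(true, /*alignment=*/8) << '\n';  // 12
  // Falkor/Kryo with this patch: penalty disabled, cost drops to 1,
  // which is what the new cost-model tests check.
  std::cout << v2i64StoreCost(false, /*alignment=*/8) << '\n'; // 1
}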