This patch enables unaligned memory accesses of vector types in the AArch64 backend, which should improve the performance of vectorized code. The implementation mirrors the ARMv7 backend's behavior for the same API (allowsUnalignedMemoryAccesses).
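For readers unfamiliar with the hook, below is a minimal, self-contained C++ sketch (not LLVM's actual code; every name in it is invented for illustration) of the decision this patch implements for vector types: target-independent lowering asks the target whether an unaligned access of a given type is allowed (and fast), and only expands the access into smaller, naturally aligned pieces when the answer is no.

// Minimal sketch, NOT LLVM code: models the vector-type decision made by
// AArch64TargetLowering::allowsUnalignedMemoryAccesses() in this patch.
#include <cstdio>

struct TargetLoweringSketch {
  bool AllowsUnaligned;   // models SCTLR.A / the strict-align setting
  bool HasNEON;
  bool IsLittleEndian;

  // Same condition the patch uses for 64/128-bit vector types.
  bool allowsUnalignedVectorAccess(bool *Fast) const {
    if (HasNEON && (AllowsUnaligned || IsLittleEndian)) {
      if (Fast)
        *Fast = true;
      return true;
    }
    return false;
  }
};

int main() {
  TargetLoweringSketch TLI = {/*AllowsUnaligned=*/false, /*HasNEON=*/true,
                              /*IsLittleEndian=*/true};
  bool Fast = false;
  if (TLI.allowsUnalignedVectorAccess(&Fast))
    std::printf("keep the unaligned vector load/store (fast=%d)\n", Fast);
  else
    std::printf("expand into smaller, naturally aligned accesses\n");
  return 0;
}

With this hook in place, an under-aligned vector load/store can stay as a single ld1/st1 instead of being scalarized, which is where the vectorized-code speedup comes from.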
Event Timeline
This looks fine to me. Tim.
See below for the rebased patch and updated test cases.
From e1c992f12bfe168a38d5885c91984a9003f07eb7 Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng <zhaoshiz@codeaurora.org>
Date: Wed, 19 Mar 2014 18:32:22 -0700
Subject: [PATCH 1/3] [AArch64] Implement allowsUnalignedMemoryAccesses()
lib/Target/AArch64/AArch64ISelLowering.cpp | 44 ++++++
lib/Target/AArch64/AArch64ISelLowering.h | 6 +
lib/Target/AArch64/AArch64Subtarget.cpp | 34 +++++
lib/Target/AArch64/AArch64Subtarget.h | 7 +
test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll | 172 +++++++++++++++++++++++
5 files changed, 263 insertions(+)
create mode 100644 test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 236d5ec..1e79894 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4412,6 +4412,50 @@
AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
return false;
}
+
+bool AArch64TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                          unsigned AddrSpace,
+                                                          bool *Fast) const {
+  const AArch64Subtarget *Subtarget = getSubtarget();
+  // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
+  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    return false;
+  // Scalar types
+  case MVT::i8:  case MVT::i16:
+  case MVT::i32: case MVT::i64:
+  case MVT::f32: case MVT::f64: {
+    // Unaligned access can use (for example) LDRB, LDRH, LDR
+    if (AllowsUnaligned) {
+      if (Fast)
+        *Fast = true;
+      return true;
+    }
+    return false;
+  }
+  // 64-bit vector types
+  case MVT::v8i8:  case MVT::v4i16:
+  case MVT::v2i32: case MVT::v1i64:
+  case MVT::v2f32: case MVT::v1f64:
+  // 128-bit vector types
+  case MVT::v16i8: case MVT::v8i16:
+  case MVT::v4i32: case MVT::v2i64:
+  case MVT::v4f32: case MVT::v2f64: {
+    // For any little-endian targets with NEON, we can support unaligned
+    // load/store of V registers using ld1/st1.
+    // A big-endian target may also explicitly support unaligned accesses.
+    if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
+      if (Fast)
+        *Fast = true;
+      return true;
+    }
+    return false;
+  }
+  }
+}
+
// Check whether a shuffle_vector could be presented as concat_vector.
bool AArch64TargetLowering::isConcatVector(SDValue Op, SelectionDAG &DAG,
SDValue V0, SDValue V1,
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index f83c1ab..154c1d7 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -355,6 +355,12 @@ public:
   /// expanded to fmul + fadd.
   virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
+  /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+  /// unaligned memory accesses of the specified type. Returns whether it
+  /// is "fast" by reference in the second argument.
+  virtual bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+                                             bool *Fast) const;
+
   ConstraintType getConstraintType(const std::string &Constraint) const;
   ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info,
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 9140bbd..53cdf30 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -25,6 +25,25 @@
using namespace llvm;
+enum AlignMode {
+  DefaultAlign,
+  StrictAlign,
+  NoStrictAlign
+};
+
+static cl::opt<AlignMode>
+Align(cl::desc("Load/store alignment support"),
+      cl::Hidden, cl::init(DefaultAlign),
+      cl::values(
+          clEnumValN(DefaultAlign, "aarch64-default-align",
+                     "Generate unaligned accesses only on hardware/OS "
+                     "combinations that are known to support them"),
+          clEnumValN(StrictAlign, "aarch64-strict-align",
+                     "Disallow all unaligned memory accesses"),
+          clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
+                     "Allow unaligned memory accesses"),
+          clEnumValEnd));
+
// Pin the vtable to this file.
void AArch64Subtarget::anchor() {}
@@ -39,6 +58,8 @@ AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS,
 void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU,
                                                    StringRef FS) {
+  AllowsUnalignedMem = false;
+
   if (CPU.empty())
     CPUString = "generic";
@@ -52,6 +73,19 @@ void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU,
   }
   ParseSubtargetFeatures(CPU, FullFS);
+
+  switch (Align) {
+  case DefaultAlign:
+    // Linux targets support unaligned accesses on AARCH64
+    AllowsUnalignedMem = isTargetLinux();
+    break;
+  case StrictAlign:
+    AllowsUnalignedMem = false;
+    break;
+  case NoStrictAlign:
+    AllowsUnalignedMem = true;
+    break;
+  }
}
bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV,
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 68c6c4b..45e5a5e 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -38,6 +38,11 @@ protected:
   bool HasNEON;
   bool HasCrypto;
+  /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
+  /// accesses for some types. For details, see
+  /// AArch64TargetLowering::allowsUnalignedMemoryAccesses().
+  bool AllowsUnalignedMem;
+
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
@@ -74,6 +79,8 @@ public:
   bool hasNEON() const { return HasNEON; }
   bool hasCrypto() const { return HasCrypto; }
+  bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
+
   bool isLittle() const { return IsLittleEndian; }
   const std::string & getCPUString() const { return CPUString; }
diff --git a/test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll b/test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll
new file mode 100644
index 0000000..2e3f7bf
--- /dev/null
+++ b/test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll
@@ -0,0 +1,172 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -aarch64-no-strict-align -mattr=+neon -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -aarch64-no-strict-align -mattr=+neon -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -aarch64-strict-align -mattr=+neon -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -aarch64-strict-align -mattr=+neon -o - | FileCheck %s --check-prefix=BE-STRICT-ALIGN
+
+;; Check element-aligned 128-bit vector load/store - integer
+define <16 x i8> @qwordint (<16 x i8>* %head.v16i8, <8 x i16>* %head.v8i16, <4 x i32>* %head.v4i32, <2 x i64>* %head.v2i64,
+                            <16 x i8>* %tail.v16i8, <8 x i16>* %tail.v8i16, <4 x i32>* %tail.v4i32, <2 x i64>* %tail.v2i64) {
+; CHECK-LABEL: qwordint
+; CHECK: ld1 {v0.16b}, [x0]
+; CHECK: ld1 {v1.8h}, [x1]
+; CHECK: ld1 {v2.4s}, [x2]
+; CHECK: ld1 {v3.2d}, [x3]
+; CHECK: st1 {v0.16b}, [x4]
+; CHECK: st1 {v1.8h}, [x5]
+; CHECK: st1 {v2.4s}, [x6]
+; CHECK: st1 {v3.2d}, [x7]
+; BE-STRICT-ALIGN-LABEL: qwordint
+; BE-STRICT-ALIGN: ldrb
+; BE-STRICT-ALIGN: ldrh
+; BE-STRICT-ALIGN: ldr
+; BE-STRICT-ALIGN: ldr
+; BE-STRICT-ALIGN: strb
+; BE-STRICT-ALIGN: strh
+; BE-STRICT-ALIGN: str
+; BE-STRICT-ALIGN: str
+entry:
+ %val.v16i8 = load <16 x i8>* %head.v16i8, align 1
+ %val.v8i16 = load <8 x i16>* %head.v8i16, align 2
+ %val.v4i32 = load <4 x i32>* %head.v4i32, align 4
+ %val.v2i64 = load <2 x i64>* %head.v2i64, align 8
+ store <16 x i8> %val.v16i8, <16 x i8>* %tail.v16i8, align 1
+ store <8 x i16> %val.v8i16, <8 x i16>* %tail.v8i16, align 2
+ store <4 x i32> %val.v4i32, <4 x i32>* %tail.v4i32, align 4
+ store <2 x i64> %val.v2i64, <2 x i64>* %tail.v2i64, align 8
+ ret <16 x i8> %val.v16i8
+}
+
+;; Check element-aligned 128-bit vector load/store - floating point
+define <4 x float> @qwordfloat (<4 x float>* %head.v4f32, <2 x double>* %head.v2f64,
+                                <4 x float>* %tail.v4f32, <2 x double>* %tail.v2f64) {
+; CHECK-LABEL: qwordfloat
+; CHECK: ld1 {v0.4s}, [x0]
+; CHECK: ld1 {v1.2d}, [x1]
+; CHECK: st1 {v0.4s}, [x2]
+; CHECK: st1 {v1.2d}, [x3]
+; BE-STRICT-ALIGN-LABEL: qwordfloat
+; BE-STRICT-ALIGN: ldr
+; BE-STRICT-ALIGN: ldr
+; BE-STRICT-ALIGN: str
+; BE-STRICT-ALIGN: str
+entry:
+ %val.v4f32 = load <4 x float>* %head.v4f32, align 4
+ %val.v2f64 = load <2 x double>* %head.v2f64, align 8
+ store <4 x float> %val.v4f32, <4 x float>* %tail.v4f32, align 4
+ store <2 x double> %val.v2f64, <2 x double>* %tail.v2f64, align 8
+ ret <4 x float> %val.v4f32
+}
+
+;; Check element-aligned 64-bit vector load/store - integer
+define <8 x i8> @dwordint (<8 x i8>* %head.v8i8, <4 x i16>* %head.v4i16, <2 x i32>* %head.v2i32, <1 x i64>* %head.v1i64,
+                           <8 x i8>* %tail.v8i8, <4 x i16>* %tail.v4i16, <2 x i32>* %tail.v2i32, <1 x i64>* %tail.v1i64) {
+; CHECK-LABEL: dwordint
+; CHECK: ld1 {v0.8b}, [x0]
+; CHECK: ld1 {v1.4h}, [x1]
+; CHECK: ld1 {v2.2s}, [x2]
+; CHECK: ld1 {v3.1d}, [x3]
+; CHECK: st1 {v0.8b}, [x4]
+; CHECK: st1 {v1.4h}, [x5]
+; CHECK: st1 {v2.2s}, [x6]
+; CHECK: st1 {v3.1d}, [x7]
+; BE-STRICT-ALIGN-LABEL: dwordint
+; BE-STRICT-ALIGN: ldrb
+; BE-STRICT-ALIGN: ldrh
+; BE-STRICT-ALIGN: ldr
+; BE-STRICT-ALIGN: ld1 {v1.1d}, [x3]
+; BE-STRICT-ALIGN: strb
+; BE-STRICT-ALIGN: strh
+; BE-STRICT-ALIGN: str
+; BE-STRICT-ALIGN: st1 {v1.1d}, [x7]
+entry:
+ %val.v8i8 = load <8 x i8>* %head.v8i8, align 1
+ %val.v4i16 = load <4 x i16>* %head.v4i16, align 2
+ %val.v2i32 = load <2 x i32>* %head.v2i32, align 4
+ %val.v1i64 = load <1 x i64>* %head.v1i64, align 8
+ store <8 x i8> %val.v8i8, <8 x i8>* %tail.v8i8 , align 1
+ store <4 x i16> %val.v4i16, <4 x i16>* %tail.v4i16, align 2
+ store <2 x i32> %val.v2i32, <2 x i32>* %tail.v2i32, align 4
+ store <1 x i64> %val.v1i64, <1 x i64>* %tail.v1i64, align 8
+ ret <8 x i8> %val.v8i8
+}
+
+;; Check element-aligned 64-bit vector load/store - floating point
+define <2 x float> @dwordfloat (<2 x float>* %head.v2f32, <1 x double>* %head.v1f64,
+                                <2 x float>* %tail.v2f32, <1 x double>* %tail.v1f64) {
+; CHECK-LABEL: dwordfloat
+; CHECK: ld1 {v0.2s}, [x0]
+; CHECK: ld1 {v1.1d}, [x1]
+; CHECK: st1 {v0.2s}, [x2]
+; CHECK: st1 {v1.1d}, [x3]
+; BE-STRICT-ALIGN-LABEL: dwordfloat
+; BE-STRICT-ALIGN: ldr
+; BE-STRICT-ALIGN: ld1 {v1.1d}, [x1]
+; BE-STRICT-ALIGN: str
+; BE-STRICT-ALIGN: st1 {v1.1d}, [x3]
+entry:
+ %val.v2f32 = load <2 x float>* %head.v2f32, align 4
+ %val.v1f64 = load <1 x double>* %head.v1f64, align 8
+ store <2 x float> %val.v2f32, <2 x float>* %tail.v2f32, align 4
+ store <1 x double> %val.v1f64, <1 x double>* %tail.v1f64, align 8
+ ret <2 x float> %val.v2f32
+}
+
+;; Check load/store of 128-bit vectors with less-than 16-byte alignment
+define <2 x i64> @align2vi64 (<2 x i64>* %head.byte, <2 x i64>* %head.half, <2 x i64>* %head.word, <2 x i64>* %head.dword,
+                              <2 x i64>* %tail.byte, <2 x i64>* %tail.half, <2 x i64>* %tail.word, <2 x i64>* %tail.dword) {
+; CHECK-LABEL: align2vi64
+; CHECK: ld1 {v0.2d}, [x0]
+; CHECK: ld1 {v1.2d}, [x1]
+; CHECK: ld1 {v2.2d}, [x2]
+; CHECK: ld1 {v3.2d}, [x3]
+; CHECK: st1 {v0.2d}, [x4]
+; CHECK: st1 {v1.2d}, [x5]
+; CHECK: st1 {v2.2d}, [x6]
+; CHECK: st1 {v3.2d}, [x7]
+; BE-STRICT-ALIGN-LABEL: align2vi64
+; BE-STRICT-ALIGN: ldrb
+; BE-STRICT-ALIGN: ldrh
+; BE-STRICT-ALIGN: ldr
+; BE-STRICT-ALIGN: strb
+; BE-STRICT-ALIGN: strh
+; BE-STRICT-ALIGN: str
+entry:
+ %val.byte = load <2 x i64>* %head.byte, align 1
+ %val.half = load <2 x i64>* %head.half, align 2
+ %val.word = load <2 x i64>* %head.word, align 4
+ %val.dword = load <2 x i64>* %head.dword, align 8
+ store <2 x i64> %val.byte, <2 x i64>* %tail.byte, align 1
+ store <2 x i64> %val.half, <2 x i64>* %tail.half, align 2
+ store <2 x i64> %val.word, <2 x i64>* %tail.word, align 4
+ store <2 x i64> %val.dword, <2 x i64>* %tail.dword, align 8
+ ret <2 x i64> %val.byte
+}
+
+;; Check load/store of 64-bit vectors with less-than 8-byte alignment
+define <2 x float> @align2vf32 (<2 x float>* %head.byte, <2 x float>* %head.half, <2 x float>* %head.word, <2 x float>* %head.dword,
+                                <2 x float>* %tail.byte, <2 x float>* %tail.half, <2 x float>* %tail.word, <2 x float>* %tail.dword) {
+; CHECK-LABEL: align2vf32
+; CHECK: ld1 {v0.2s}, [x0]
+; CHECK: ld1 {v1.2s}, [x1]
+; CHECK: ld1 {v2.2s}, [x2]
+; CHECK: st1 {v0.2s}, [x4]
+; CHECK: st1 {v1.2s}, [x5]
+; CHECK: st1 {v2.2s}, [x6]
+; BE-STRICT-ALIGN-LABEL: align2vf32
+; BE-STRICT-ALIGN: ldrb
+; BE-STRICT-ALIGN: ldrh
+; BE-STRICT-ALIGN: ldr
+; BE-STRICT-ALIGN: strb
+; BE-STRICT-ALIGN: strh
+; BE-STRICT-ALIGN: str
+entry:
+ %val.byte = load <2 x float>* %head.byte, align 1
+ %val.half = load <2 x float>* %head.half, align 2
+ %val.word = load <2 x float>* %head.word, align 4
+ store <2 x float> %val.byte, <2 x float>* %tail.byte, align 1
+ store <2 x float> %val.half, <2 x float>* %tail.half, align 2
+ store <2 x float> %val.word, <2 x float>* %tail.word, align 4
+ ret <2 x float> %val.byte
+}
This revision includes additional tests that exercise the new flags -aarch64-no-strict-align and -aarch64-strict-align on both BE and LE targets.
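For reference, the new flags can be exercised directly with llc on the added test file; the command lines below simply mirror the test's own RUN lines (paths relative to the LLVM source tree):

  # Allow unaligned accesses: unaligned vector load/store should lower to ld1/st1.
  llc < test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll \
      -mtriple=aarch64-none-linux-gnu -mattr=+neon -aarch64-no-strict-align -o -

  # Forbid them on big-endian: scalar ldrb/ldrh/ldr sequences are expected instead.
  llc < test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll \
      -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -aarch64-strict-align -o -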
BTW, I don't have commit access to llvm trunk. If this revision looks good to you, please merge it. Thank you.
Committed as r206557.
I think the test case relates to big-endian handling and will need to be ported to ARM64 later on.
Thanks,
-Jiangning
2014-04-18 8:43 GMT+08:00 Z. Zheng <zhaoshiz@codeaurora.org>:
> This revision includes additional tests that exercise the new flags -aarch64-no-strict-align and -aarch64-strict-align on both BE and LE.
> BTW, I don't have commit access to llvm trunk. If this revision looks good to you, please merge it. Thank you.
Hi Jiangning, t.p.northover,
CHANGE SINCE LAST DIFF
http://reviews.llvm.org/D3319?vs=8427&id=8616#toc

Files:
  lib/Target/AArch64/AArch64ISelLowering.cpp
  lib/Target/AArch64/AArch64ISelLowering.h
  lib/Target/AArch64/AArch64Subtarget.cpp
  lib/Target/AArch64/AArch64Subtarget.h
  test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll