diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6722,6 +6722,46 @@ def : Ld1Lane128Pat; def : Ld1Lane128Pat; +// Generate LD1 for extload if memory type does not match the +// destination type, for example: +// +// (v4i32 (insert_vector_elt (load anyext from i8) idx)) +// +// In this case, the index must be adjusted to match LD1 type. +// +class Ld1Lane128IdxOpPat + : Pat<(vector_insert (VTy VecListOne128:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (LD1 VecListOne128:$Rd, (IdxOp VecIndex:$idx), GPR64sp:$Rn)>; + +def VectorIndexStoH : SDNodeXFormgetTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64); +}]>; +def VectorIndexStoB : SDNodeXFormgetTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64); +}]>; +def VectorIndexHtoB : SDNodeXFormgetTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64); +}]>; + +def : Ld1Lane128IdxOpPat; +def : Ld1Lane128IdxOpPat; +def : Ld1Lane128IdxOpPat; + +// Same as above, but the first element is populated using +// scalar_to_vector + insert_subvector instead of insert_vector_elt. +class Ld1Lane128FirstElm + : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))), + (ResultTy (EXTRACT_SUBREG + (LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>; + +def : Ld1Lane128FirstElm; +def : Ld1Lane128FirstElm; +def : Ld1Lane128FirstElm; + class Ld1Lane64Pat : Pat<(vector_insert (VTy VecListOne64:$Rd), diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-LE +; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE + +define <2 x i16> @test0(i16* %i16_ptr, i64 %inc) { +; CHECK-LE-LABEL: test0: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test0: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %i_0 = load i16, i16* %i16_ptr + %v0 = insertelement <2 x i16> undef, i16 %i_0, i32 0 + ret <2 x i16> %v0 +} + +define <2 x i16> @test1(<2 x i16>* %v2i16_ptr) { +; CHECK-LE-LABEL: test1: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-LE-NEXT: add x8, x0, #2 // =2 +; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test1: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-BE-NEXT: add x8, x0, #2 // =2 +; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %v2i16 = load <2 x i16>, <2 x i16>* %v2i16_ptr + ret <2 x i16> %v2i16 +} + +define <2 x i16> @test2(i16* %i16_ptr, i64 %inc) { +; CHECK-LE-LABEL: test2: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-LE-NEXT: add x8, x0, x1, lsl #1 +; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test2: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-BE-NEXT: add x8, x0, x1, lsl #1 +; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %i_0 = load i16, i16* %i16_ptr + %i16_ptr_inc = getelementptr i16, i16* %i16_ptr, i64 %inc + %i_1 = load i16, i16* %i16_ptr_inc + %v0 = insertelement <2 x i16> undef, i16 %i_0, i32 0 + %v1 = insertelement <2 x i16> %v0, i16 %i_1, i32 1 + ret <2 x i16> %v1 +} + +define <2 x i8> @test3(<2 x i8>* %v2i8_ptr) { +; CHECK-LE-LABEL: test3: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-LE-NEXT: add x8, x0, #1 // =1 +; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test3: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-BE-NEXT: add x8, x0, #1 // =1 +; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %v2i8 = load <2 x i8>, <2 x i8>* %v2i8_ptr + ret <2 x i8> %v2i8 +} + +define <4 x i8> @test4(<4 x i8>* %v4i8_ptr) { +; CHECK-LE-LABEL: test4: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-LE-NEXT: add x8, x0, #1 // =1 +; CHECK-LE-NEXT: ld1 { v0.b }[2], [x8] +; CHECK-LE-NEXT: add x8, x0, #2 // =2 +; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-LE-NEXT: add x8, x0, #3 // =3 +; CHECK-LE-NEXT: ld1 { v0.b }[6], [x8] +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: test4: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-BE-NEXT: add x8, x0, #1 // =1 +; CHECK-BE-NEXT: ld1 { v0.b }[2], [x8] +; CHECK-BE-NEXT: add x8, x0, #2 // =2 +; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-BE-NEXT: add x8, x0, #3 // =3 +; CHECK-BE-NEXT: ld1 { v0.b }[6], [x8] +; CHECK-BE-NEXT: rev64 v0.4h, v0.4h +; CHECK-BE-NEXT: ret + %v4i8 = load <4 x i8>, <4 x i8>* %v4i8_ptr + ret <4 x i8> %v4i8 +} diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -145,17 +145,15 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x1] -; CHECK-NEXT: ldrb w10, [x0, #1] -; CHECK-NEXT: ldrb w11, [x1, #1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-NEXT: ld1 { v0.b }[0], [x1] +; CHECK-NEXT: ld1 { v1.b }[0], [x0] +; CHECK-NEXT: add x8, x0, #1 // =1 +; CHECK-NEXT: add x9, x1, #1 // =1 +; CHECK-NEXT: ld1 { v0.b }[4], [x9] +; CHECK-NEXT: ld1 { v1.b }[4], [x8] ; CHECK-NEXT: shl v0.2s, v0.2s, #24 -; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-NEXT: sqadd v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 @@ -187,17 +185,15 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x1] -; CHECK-NEXT: ldrh w10, [x0, #2] -; CHECK-NEXT: ldrh w11, [x1, #2] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: ld1 { v0.h }[0], [x1] +; CHECK-NEXT: ld1 { v1.h }[0], [x0] +; CHECK-NEXT: add x8, x0, #2 // =2 +; CHECK-NEXT: add x9, x1, #2 // =2 +; CHECK-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-NEXT: ld1 { v1.h }[2], [x8] ; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: sqadd v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -146,17 +146,15 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x1] -; CHECK-NEXT: ldrb w10, [x0, #1] -; CHECK-NEXT: ldrb w11, [x1, #1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-NEXT: ld1 { v0.b }[0], [x1] +; CHECK-NEXT: ld1 { v1.b }[0], [x0] +; CHECK-NEXT: add x8, x0, #1 // =1 +; CHECK-NEXT: add x9, x1, #1 // =1 +; CHECK-NEXT: ld1 { v0.b }[4], [x9] +; CHECK-NEXT: ld1 { v1.b }[4], [x8] ; CHECK-NEXT: shl v0.2s, v0.2s, #24 -; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-NEXT: sqsub v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 @@ -188,17 +186,15 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x1] -; CHECK-NEXT: ldrh w10, [x0, #2] -; CHECK-NEXT: ldrh w11, [x1, #2] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: ld1 { v0.h }[0], [x1] +; CHECK-NEXT: ld1 { v1.h }[0], [x0] +; CHECK-NEXT: add x8, x0, #2 // =2 +; CHECK-NEXT: add x9, x1, #2 // =2 +; CHECK-NEXT: ld1 { v0.h }[2], [x9] +; CHECK-NEXT: ld1 { v1.h }[2], [x8] ; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: sqsub v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0