Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4592,10 +4592,8 @@
 def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 
-def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
-def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>;
-
 // EDIT per word & halfword: 2s, 4h, 4s, & 8h
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
 defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
 
 def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -4617,6 +4615,7 @@
 def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
           (MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
 
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
 // EDIT per word: 2s & 4s with MSL shifter
 def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
                   [(set (v2i32 V64:$Rd),
@@ -4629,13 +4628,31 @@
 def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255,
                                               "movi", ".8b",
                      [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
+
 def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
                                                "movi", ".16b",
                      [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
+}
+
+// Use the more efficient MOVI instead of DUP from ZR to zero up vectors
+def : Pat<(v2f32 (AArch64dup (f32 fpimm0))), (MOVIv2i32 (i32 0), (i32 0))>;
+
+def : Pat<(v2i32 (AArch64dup (i32 0))), (MOVIv2i32 (i32 0), (i32 0))>;
+def : Pat<(v4i16 (AArch64dup (i32 0))), (MOVIv4i16 (i32 0), (i32 0))>;
+def : Pat<(v8i8 (AArch64dup (i32 0))), (MOVIv8b_ns (i32 0))>;
+
+def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv4i32 (i32 0), (i32 0))>;
+
+def : Pat<(v2i64 (AArch64dup (i64 0))), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4i32 (AArch64dup (i32 0))), (MOVIv4i32 (i32 0), (i32 0))>;
+def : Pat<(v8i16 (AArch64dup (i32 0))), (MOVIv8i16 (i32 0), (i32 0))>;
+def : Pat<(v16i8 (AArch64dup (i32 0))), (MOVIv16b_ns (i32 0))>;
 
 // AdvSIMD MVNI
 
 // EDIT per word & halfword: 2s, 4h, 4s, & 8h
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
 defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
 
 def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -4658,12 +4675,14 @@
           (MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
 
 // EDIT per word: 2s & 4s with MSL shifter
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
 def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
                   [(set (v2i32 V64:$Rd),
                         (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
 def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
                   [(set (v4i32 V128:$Rd),
                         (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+}
 
 //----------------------------------------------------------------------------
 // AdvSIMD indexed element
Index: llvm/trunk/test/CodeGen/AArch64/arm64-build-vector.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-build-vector.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -1,23 +1,5 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
 
-; Check that building up a vector w/ only one non-zero lane initializes
-; intelligently.
-define void @one_lane(i32* nocapture %out_int, i32 %skip0) nounwind {
-; CHECK-LABEL: one_lane:
-; CHECK: dup.16b v[[REG:[0-9]+]], wzr
-; CHECK-NEXT: mov.b v[[REG]][0], w1
-; v and q are aliases, and str is preferred against st.16b when possible
-; rdar://11246289
-; CHECK: str q[[REG]], [x0]
-; CHECK: ret
-  %conv = trunc i32 %skip0 to i8
-  %vset_lane = insertelement <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %conv, i32 0
-  %tmp = bitcast i32* %out_int to <4 x i32>*
-  %tmp1 = bitcast <16 x i8> %vset_lane to <4 x i32>
-  store <4 x i32> %tmp1, <4 x i32>* %tmp, align 16
-  ret void
-}
-
 ; Check that building a vector from floats doesn't insert an unnecessary
 ; copy for lane zero.
 define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind {
Index: llvm/trunk/test/CodeGen/AArch64/arm64-vector-insertion.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vector-insertion.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -8,7 +8,7 @@
   ret void
 
   ; CHECK-LABEL: test0f
-  ; CHECK: movi.2d v[[TEMP:[0-9]+]], #0000000000000000
+  ; CHECK: movi.4s v[[TEMP:[0-9]+]], #0
   ; CHECK: mov.s v[[TEMP]][0], v{{[0-9]+}}[0]
   ; CHECK: str q[[TEMP]], [x0]
   ; CHECK: ret
@@ -16,7 +16,6 @@
 
 }
 
-
 define void @test1f(float* nocapture %x, float %a) #0 {
 entry:
   %0 = insertelement <4 x float> , float %a, i32 0
Index: llvm/trunk/test/CodeGen/AArch64/build-one-lane.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/build-one-lane.ll
+++ llvm/trunk/test/CodeGen/AArch64/build-one-lane.ll
@@ -0,0 +1,84 @@
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+
+; Check that building up a vector w/ only one non-zero lane initializes
+; intelligently.
+
+define <8 x i8> @v8i8(i8 %t, i8 %s) nounwind {
+  %v = insertelement <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef>, i8 %s, i32 7
+  ret <8 x i8> %v
+
+; CHECK: movi v[[R:[0-9]+]].8b, #0
+; CHECK: mov v[[R]].b[7], w{{[0-9]+}}
+}
+
+define <16 x i8> @v16i8(i8 %t, i8 %s) nounwind {
+  %v = insertelement <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef>, i8 %s, i32 15
+  ret <16 x i8> %v
+
+; CHECK: movi v[[R:[0-9]+]].16b, #0
+; CHECK: mov v[[R]].b[15], w{{[0-9]+}}
+}
+
+define <4 x i16> @v4i16(i16 %t, i16 %s) nounwind {
+  %v = insertelement <4 x i16> <i16 0, i16 0, i16 0, i16 undef>, i16 %s, i32 3
+  ret <4 x i16> %v
+
+; CHECK: movi v[[R:[0-9]+]].4h, #0
+; CHECK: mov v[[R]].h[3], w{{[0-9]+}}
+}
+
+define <8 x i16> @v8i16(i16 %t, i16 %s) nounwind {
+  %v = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 undef>, i16 %s, i32 7
+  ret <8 x i16> %v
+
+; CHECK: movi v[[R:[0-9]+]].8h, #0
+; CHECK: mov v[[R]].h[7], w{{[0-9]+}}
}
+
+define <2 x i32> @v2i32(i32 %t, i32 %s) nounwind {
+  %v = insertelement <2 x i32> <i32 0, i32 undef>, i32 %s, i32 1
+  ret <2 x i32> %v
+
+; CHECK: movi v[[R:[0-9]+]].2s, #0
+; CHECK: mov v[[R]].s[1], w{{[0-9]+}}
+}
+
+define <4 x i32> @v4i32(i32 %t, i32 %s) nounwind {
+  %v = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, i32 %s, i32 3
+  ret <4 x i32> %v
+
+; CHECK: movi v[[R:[0-9]+]].4s, #0
+; CHECK: mov v[[R]].s[3], w{{[0-9]+}}
+}
+
+define <2 x i64> @v2i64(i64 %t, i64 %s) nounwind {
+  %v = insertelement <2 x i64> <i64 0, i64 undef>, i64 %s, i32 1
+  ret <2 x i64> %v
+
+; CHECK: movi v[[R:[0-9]+]].2d, #0
+; CHECK: mov v[[R]].d[1], x{{[0-9]+}}
+}
+
+define <2 x float> @v2f32(float %t, float %s) nounwind {
+  %v = insertelement <2 x float> <float 0.000000e+00, float undef>, float %s, i32 1
+  ret <2 x float> %v
+
+; CHECK: movi v[[R:[0-9]+]].2s, #0
+; CHECK: mov v[[R]].s[1], v{{[0-9]+}}.s[0]
+}
+
+define <4 x float> @v4f32(float %t, float %s) nounwind {
+  %v = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, float %s, i32 3
+  ret <4 x float> %v
+
+; CHECK: movi v[[R:[0-9]+]].4s, #0
+; CHECK: mov v[[R]].s[3], v{{[0-9]+}}.s[0]
+}
+
+define <2 x double> @v2f64(double %t, double %s) nounwind {
+  %v = insertelement <2 x double> <double 0.000000e+00, double undef>, double %s, i32 1
+  ret <2 x double> %v
+
+; CHECK: movi v[[R:[0-9]+]].2d, #0
+; CHECK: mov v[[R]].d[1], v{{[0-9]+}}.d[0]
+}
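
For reference, the codegen difference the test updates above describe can be reproduced outside the test suite with a one-function file such as the sketch below. The file name, function name, and exact output are illustrative assumptions rather than part of the patch; the IR simply mirrors the removed one_lane test and the new v16i8 test.

; zero-lane.ll - illustrative only, not part of this patch.
; Build one live lane on top of an otherwise all-zero <16 x i8> vector.
define <16 x i8> @set_lane15(i8 %s) nounwind {
  %v = insertelement <16 x i8> zeroinitializer, i8 %s, i32 15
  ret <16 x i8> %v
}

Fed to llc -mtriple=aarch64--, the old patterns materialized the zero vector with a DUP from WZR (the sequence the removed one_lane CHECK lines pinned down); with the new AArch64dup-of-zero patterns the expected selection is movi v0.16b, #0 followed by mov v0.b[15], w0, matching the CHECK lines in build-one-lane.ll, and the MOVI is now marked rematerializable and as cheap as a move.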