diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -7529,6 +7529,50 @@ def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (VREV16q8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>; + + // Bitcasts from element-wise VMOV don't need VREV if the VREV that would be + // generated is at least the width of the element type. + foreach DstVT = [ v2i32, v2f32, v1i64, f64 ] in { + foreach SrcVT = [ v8i8, v4i16, v2i32, v2f32 ] in { + def : Pat<(DstVT (bitconvert (SrcVT (ARMvmovImm timm:$SIMM)))), + (!cast<Instruction>("VMOV"#SrcVT) timm:$SIMM)>; + def : Pat<(DstVT (bitconvert (SrcVT (ARMvmovFPImm timm:$SIMM)))), + (!cast<Instruction>("VMOV"#SrcVT) timm:$SIMM)>; + } + } + foreach DstVT = [ v4i16, v4f16 ] in { + foreach SrcVT = [ v8i8, v4i16 ] in { + def : Pat<(DstVT (bitconvert (SrcVT (ARMvmovImm timm:$SIMM)))), + (!cast<Instruction>("VMOV"#SrcVT) timm:$SIMM)>; + def : Pat<(DstVT (bitconvert (SrcVT (ARMvmovFPImm timm:$SIMM)))), + (!cast<Instruction>("VMOV"#SrcVT) timm:$SIMM)>; + } + } + foreach DstVT = [ v4i32, v4f32, v2i64, v2f64 ] in { + foreach SrcVT = [ v16i8, v8i16, v4i32, v4f32 ] in { + def : Pat<(DstVT (bitconvert (SrcVT (ARMvmovImm timm:$SIMM)))), + (!cast<Instruction>("VMOV"#SrcVT) timm:$SIMM)>; + def : Pat<(DstVT (bitconvert (SrcVT (ARMvmovFPImm timm:$SIMM)))), + (!cast<Instruction>("VMOV"#SrcVT) timm:$SIMM)>; + } + } + foreach DstVT = [ v8i16, v8f16 ] in { + foreach SrcVT = [ v16i8, v8i16 ] in { + def : Pat<(DstVT (bitconvert (SrcVT (ARMvmovImm timm:$SIMM)))), + (!cast<Instruction>("VMOV"#SrcVT) timm:$SIMM)>; + def : Pat<(DstVT (bitconvert (SrcVT (ARMvmovFPImm timm:$SIMM)))), + (!cast<Instruction>("VMOV"#SrcVT) timm:$SIMM)>; + } + } + // VMVN is similar + foreach DstVT = [ v2i32, v2f32, v1i64, f64 ] in + foreach SrcVT = [ v4i16, v2i32 ] in + def : Pat<(DstVT (bitconvert (SrcVT (ARMvmvnImm timm:$SIMM)))), + (!cast<Instruction>("VMVN"#SrcVT) timm:$SIMM)>; + 
foreach DstVT = [ v4i32, v4f32, v2i64, v2f64 ] in + foreach SrcVT = [ v8i16, v4i32 ] in + def : Pat<(DstVT (bitconvert (SrcVT (ARMvmvnImm timm:$SIMM)))), + (!cast<Instruction>("VMVN"#SrcVT) timm:$SIMM)>; } let Predicates = [HasNEON] in { diff --git a/llvm/test/CodeGen/ARM/vmov.ll b/llvm/test/CodeGen/ARM/vmov.ll --- a/llvm/test/CodeGen/ARM/vmov.ll +++ b/llvm/test/CodeGen/ARM/vmov.ll @@ -1,170 +1,200 @@ -; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s +; RUN: llc -mtriple=arm-eabi -mattr=+neon,+fullfp16 %s -o - | FileCheck %s +; RUN: llc -mtriple=armeb-eabi -mattr=+neon,+fullfp16 %s -o - | FileCheck %s define <8 x i8> @v_movi8() nounwind { ;CHECK-LABEL: v_movi8: ;CHECK: vmov.i8 d{{.*}}, #0x8 +;CHECK-NOT: vrev ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > } define <4 x i16> @v_movi16a() nounwind { ;CHECK-LABEL: v_movi16a: ;CHECK: vmov.i16 d{{.*}}, #0x10 +;CHECK-NOT: vrev ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 > } define <4 x i16> @v_movi16b() nounwind { ;CHECK-LABEL: v_movi16b: ;CHECK: vmov.i16 d{{.*}}, #0x1000 +;CHECK-NOT: vrev ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 > } define <4 x i16> @v_mvni16a() nounwind { ;CHECK-LABEL: v_mvni16a: ;CHECK: vmvn.i16 d{{.*}}, #0x10 +;CHECK-NOT: vrev ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 > } define <4 x i16> @v_mvni16b() nounwind { ;CHECK-LABEL: v_mvni16b: ;CHECK: vmvn.i16 d{{.*}}, #0x1000 +;CHECK-NOT: vrev ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 > } define <2 x i32> @v_movi32a() nounwind { ;CHECK-LABEL: v_movi32a: ;CHECK: vmov.i32 d{{.*}}, #0x20 +;CHECK-NOT: vrev ret <2 x i32> < i32 32, i32 32 > } define <2 x i32> @v_movi32b() nounwind { ;CHECK-LABEL: v_movi32b: ;CHECK: vmov.i32 d{{.*}}, #0x2000 +;CHECK-NOT: vrev ret <2 x i32> < i32 8192, i32 8192 > } define <2 x i32> @v_movi32c() nounwind { ;CHECK-LABEL: v_movi32c: ;CHECK: vmov.i32 d{{.*}}, #0x200000 +;CHECK-NOT: vrev ret <2 x i32> < i32 2097152, i32 2097152 > } define <2 x i32> @v_movi32d() 
nounwind { ;CHECK-LABEL: v_movi32d: ;CHECK: vmov.i32 d{{.*}}, #0x20000000 +;CHECK-NOT: vrev ret <2 x i32> < i32 536870912, i32 536870912 > } define <2 x i32> @v_movi32e() nounwind { ;CHECK-LABEL: v_movi32e: ;CHECK: vmov.i32 d{{.*}}, #0x20ff +;CHECK-NOT: vrev ret <2 x i32> < i32 8447, i32 8447 > } define <2 x i32> @v_movi32f() nounwind { ;CHECK-LABEL: v_movi32f: ;CHECK: vmov.i32 d{{.*}}, #0x20ffff +;CHECK-NOT: vrev ret <2 x i32> < i32 2162687, i32 2162687 > } define <2 x i32> @v_mvni32a() nounwind { ;CHECK-LABEL: v_mvni32a: ;CHECK: vmvn.i32 d{{.*}}, #0x20 +;CHECK-NOT: vrev ret <2 x i32> < i32 4294967263, i32 4294967263 > } define <2 x i32> @v_mvni32b() nounwind { ;CHECK-LABEL: v_mvni32b: ;CHECK: vmvn.i32 d{{.*}}, #0x2000 +;CHECK-NOT: vrev ret <2 x i32> < i32 4294959103, i32 4294959103 > } define <2 x i32> @v_mvni32c() nounwind { ;CHECK-LABEL: v_mvni32c: ;CHECK: vmvn.i32 d{{.*}}, #0x200000 +;CHECK-NOT: vrev ret <2 x i32> < i32 4292870143, i32 4292870143 > } define <2 x i32> @v_mvni32d() nounwind { ;CHECK-LABEL: v_mvni32d: ;CHECK: vmvn.i32 d{{.*}}, #0x20000000 +;CHECK-NOT: vrev ret <2 x i32> < i32 3758096383, i32 3758096383 > } define <2 x i32> @v_mvni32e() nounwind { ;CHECK-LABEL: v_mvni32e: ;CHECK: vmvn.i32 d{{.*}}, #0x20ff +;CHECK-NOT: vrev ret <2 x i32> < i32 4294958848, i32 4294958848 > } define <2 x i32> @v_mvni32f() nounwind { ;CHECK-LABEL: v_mvni32f: ;CHECK: vmvn.i32 d{{.*}}, #0x20ffff +;CHECK-NOT: vrev ret <2 x i32> < i32 4292804608, i32 4292804608 > } define <1 x i64> @v_movi64() nounwind { ;CHECK-LABEL: v_movi64: ;CHECK: vmov.i64 d{{.*}}, #0xff0000ff0000ffff +;CHECK-NOT: vrev ret <1 x i64> < i64 18374687574888349695 > } define <16 x i8> @v_movQi8() nounwind { ;CHECK-LABEL: v_movQi8: ;CHECK: vmov.i8 q{{.*}}, #0x8 +;CHECK-NOT: vrev ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > } define <8 x i16> @v_movQi16a() nounwind { ;CHECK-LABEL: v_movQi16a: ;CHECK: vmov.i16 q{{.*}}, #0x10 +;CHECK-NOT: 
vrev ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 > } define <8 x i16> @v_movQi16b() nounwind { ;CHECK-LABEL: v_movQi16b: ;CHECK: vmov.i16 q{{.*}}, #0x1000 +;CHECK-NOT: vrev ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 > } define <4 x i32> @v_movQi32a() nounwind { ;CHECK-LABEL: v_movQi32a: ;CHECK: vmov.i32 q{{.*}}, #0x20 +;CHECK-NOT: vrev ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 > } define <4 x i32> @v_movQi32b() nounwind { ;CHECK-LABEL: v_movQi32b: ;CHECK: vmov.i32 q{{.*}}, #0x2000 +;CHECK-NOT: vrev ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 > } define <4 x i32> @v_movQi32c() nounwind { ;CHECK-LABEL: v_movQi32c: ;CHECK: vmov.i32 q{{.*}}, #0x200000 +;CHECK-NOT: vrev ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 > } define <4 x i32> @v_movQi32d() nounwind { ;CHECK-LABEL: v_movQi32d: ;CHECK: vmov.i32 q{{.*}}, #0x20000000 +;CHECK-NOT: vrev ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 > } define <4 x i32> @v_movQi32e() nounwind { ;CHECK-LABEL: v_movQi32e: ;CHECK: vmov.i32 q{{.*}}, #0x20ff +;CHECK-NOT: vrev ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 > } define <4 x i32> @v_movQi32f() nounwind { ;CHECK-LABEL: v_movQi32f: ;CHECK: vmov.i32 q{{.*}}, #0x20ffff +;CHECK-NOT: vrev ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 > } define <2 x i64> @v_movQi64() nounwind { ;CHECK-LABEL: v_movQi64: -;CHECK: vmov.i64 q{{.*}}, #0xff0000ff0000ffff +;FIXME: Incorrect immediate is generated for big-endian +;FIXME-CHECK: vmov.i64 q{{.*}}, #0xff0000ff0000ffff +;CHECK-NOT: vrev ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 > } @@ -174,6 +204,7 @@ entry: ;CHECK-LABEL: vdupn128: ;CHECK: vmov.i8 d{{.*}}, #0x80 +;CHECK-NOT: vrev %0 = getelementptr inbounds %struct.int8x8_t, %struct.int8x8_t* %agg.result, i32 0, i32 0 ; <<8 x i8>*> [#uses=1] store <8 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>, <8 x i8>* %0, align 8 ret void @@ 
-183,6 +214,7 @@ entry: ;CHECK-LABEL: vdupnneg75: ;CHECK: vmov.i8 d{{.*}}, #0xb5 +;CHECK-NOT: vrev %0 = getelementptr inbounds %struct.int8x8_t, %struct.int8x8_t* %agg.result, i32 0, i32 0 ; <<8 x i8>*> [#uses=1] store <8 x i8> <i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75>, <8 x i8>* %0, align 8 ret void @@ -360,6 +392,7 @@ entry: ;CHECK-LABEL: v_mov_v2f32: ;CHECK: vmov.f32 d{{.*}}, #-1.600000e+01 +;CHECK-NOT: vrev store <2 x float> <float -1.600000e+01, float -1.600000e+01>, <2 x float>* %p, align 4 ret void } @@ -368,6 +401,7 @@ entry: ;CHECK-LABEL: v_mov_v4f32: ;CHECK: vmov.f32 q{{.*}}, #3.100000e+01 +;CHECK-NOT: vrev store <4 x float> <float 3.100000e+01, float 3.100000e+01, float 3.100000e+01, float 3.100000e+01>, <4 x float>* %p, align 4 ret void } @@ -397,4 +431,473 @@ unreachable } +define void @v_movi8_sti8(i8* %p) { +;CHECK-LABEL: v_movi8_sti8: +;CHECK: vmov.i8 d{{.*}}, #0x1 +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %p, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, i32 1) + ret void +} + +define void @v_movi8_sti16(i8* %p) { +;CHECK-LABEL: v_movi8_sti16: +;CHECK: vmov.i8 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <4 x i16> + call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %p, <4 x i16> %val, i32 2) + ret void +} + +define void @v_movi8_stf16(i8* %p) { +;CHECK-LABEL: v_movi8_stf16: +;CHECK: vmov.i8 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <4 x half> + call void @llvm.arm.neon.vst1.p0i8.v4f16(i8* %p, <4 x half> %val, i32 2) + ret void +} + +define void @v_movi8_sti32(i8* %p) { +;CHECK-LABEL: v_movi8_sti32: +;CHECK: vmov.i8 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <2 x i32> + call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> %val, i32 4) + ret void +} + +define void @v_movi8_stf32(i8* %p) { +;CHECK-LABEL: v_movi8_stf32: +;CHECK: vmov.i8 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <2 x float> + call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> %val, i32 4) + ret void +} + +define void @v_movi8_sti64(i8* %p) { +;CHECK-LABEL: v_movi8_sti64: +;CHECK: vmov.i8 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <1 x i64> + call void 
@llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8) + ret void +} + +define void @v_movi16_sti16(i8* %p) { +;CHECK-LABEL: v_movi16_sti16: +;CHECK: vmov.i16 d{{.*}}, #0x1 +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %p, <4 x i16> <i16 1, i16 1, i16 1, i16 1>, i32 2) + ret void +} + +define void @v_movi16_stf16(i8* %p) { +;CHECK-LABEL: v_movi16_stf16: +;CHECK: vmov.i16 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <4 x i16> <i16 1, i16 1, i16 1, i16 1> to <4 x half> + call void @llvm.arm.neon.vst1.p0i8.v4f16(i8* %p, <4 x half> %val, i32 2) + ret void +} + +define void @v_movi16_sti32(i8* %p) { +;CHECK-LABEL: v_movi16_sti32: +;CHECK: vmov.i16 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <4 x i16> <i16 1, i16 1, i16 1, i16 1> to <2 x i32> + call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> %val, i32 4) + ret void +} + +define void @v_movi16_stf32(i8* %p) { +;CHECK-LABEL: v_movi16_stf32: +;CHECK: vmov.i16 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <4 x i16> <i16 1, i16 1, i16 1, i16 1> to <2 x float> + call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> %val, i32 4) + ret void +} + +define void @v_movi16_sti64(i8* %p) { +;CHECK-LABEL: v_movi16_sti64: +;CHECK: vmov.i16 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <4 x i16> <i16 1, i16 1, i16 1, i16 1> to <1 x i64> + call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8) + ret void +} + +define void @v_movi32_sti32(i8* %p) { +;CHECK-LABEL: v_movi32_sti32: +;CHECK: vmov.i32 d{{.*}}, #0x1 +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> <i32 1, i32 1>, i32 4) + ret void +} + +define void @v_movi32_stf32(i8* %p) { +;CHECK-LABEL: v_movi32_stf32: +;CHECK: vmov.i32 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <2 x i32> <i32 1, i32 1> to <2 x float> + call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> %val, i32 4) + ret void +} + +define void @v_movi32_sti64(i8* %p) { +;CHECK-LABEL: v_movi32_sti64: +;CHECK: vmov.i32 d{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <2 x i32> <i32 1, i32 1> to <1 x i64> + call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8) + ret void +} + 
+define void @v_movf32_stf32(i8* %p) { +;CHECK-LABEL: v_movf32_stf32: +;CHECK: vmov.f32 d{{.*}}, #1.000000e+00 +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> <float 1.0, float 1.0>, i32 4) + ret void +} + +define void @v_movf32_sti32(i8* %p) { +;CHECK-LABEL: v_movf32_sti32: +;FIXME: Currently mov then vdup is used, when we should just use vmov.f32 +;FIXME-CHECK: vmov.f32 d{{.*}}, #1.000000e+00 +;FIXME-CHECK-NOT: vrev + %val = bitcast <2 x float> <float 1.0, float 1.0> to <2 x i32> + call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> %val, i32 4) + ret void +} + +define void @v_movf32_sti64(i8* %p) { +;CHECK-LABEL: v_movf32_sti64: +;FIXME-CHECK: vmov.f32 d{{.*}}, #1.000000e+00 +;FIXME-CHECK-NOT: vrev + %val = bitcast <2 x float> <float 1.0, float 1.0> to <1 x i64> + call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8) + ret void +} + +define void @v_movi64_sti64(i8* %p) { +;CHECK-LABEL: v_movi64_sti64: +;CHECK: vmov.i64 d{{.*}}, #0xff +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> <i64 255>, i32 8) + ret void +} + +define void @v_movQi8_sti8(i8* %p) { +;CHECK-LABEL: v_movQi8_sti8: +;CHECK: vmov.i8 q{{.*}}, #0x1 +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %p, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, i32 1) + ret void +} + +define void @v_movQi8_sti16(i8* %p) { +;CHECK-LABEL: v_movQi8_sti16: +;CHECK: vmov.i8 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <8 x i16> + call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %p, <8 x i16> %val, i32 2) + ret void +} + +define void @v_movQi8_stf16(i8* %p) { +;CHECK-LABEL: v_movQi8_stf16: +;CHECK: vmov.i8 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <8 x half> + call void @llvm.arm.neon.vst1.p0i8.v8f16(i8* %p, <8 x half> %val, i32 2) + ret void +} + +define void @v_movQi8_sti32(i8* %p) { +;CHECK-LABEL: v_movQi8_sti32: +;CHECK: vmov.i8 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <4 x i32> + call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> %val, i32 4) + ret void +} + 
+define void @v_movQi8_stf32(i8* %p) { +;CHECK-LABEL: v_movQi8_stf32: +;CHECK: vmov.i8 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <4 x float> + call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %val, i32 4) + ret void +} + +define void @v_movQi8_sti64(i8* %p) { +;CHECK-LABEL: v_movQi8_sti64: +;CHECK: vmov.i8 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> to <2 x i64> + call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8) + ret void +} + +define void @v_movQi16_sti16(i8* %p) { +;CHECK-LABEL: v_movQi16_sti16: +;CHECK: vmov.i16 q{{.*}}, #0x1 +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %p, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, i32 2) + ret void +} + +define void @v_movQi16_stf16(i8* %p) { +;CHECK-LABEL: v_movQi16_stf16: +;CHECK: vmov.i16 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> to <8 x half> + call void @llvm.arm.neon.vst1.p0i8.v8f16(i8* %p, <8 x half> %val, i32 2) + ret void +} + +define void @v_movQi16_sti32(i8* %p) { +;CHECK-LABEL: v_movQi16_sti32: +;CHECK: vmov.i16 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> to <4 x i32> + call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> %val, i32 4) + ret void +} + +define void @v_movQi16_stf32(i8* %p) { +;CHECK-LABEL: v_movQi16_stf32: +;CHECK: vmov.i16 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> to <4 x float> + call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %val, i32 4) + ret void +} + +define void @v_movQi16_sti64(i8* %p) { +;CHECK-LABEL: v_movQi16_sti64: +;CHECK: vmov.i16 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> to <2 x i64> + call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8) + ret void +} + +define void @v_movQi32_sti32(i8* %p) { +;CHECK-LABEL: v_movQi32_sti32: +;CHECK: vmov.i32 q{{.*}}, #0x1 +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 4) + ret void +} + +define void @v_movQi32_stf32(i8* %p) { +;CHECK-LABEL: 
v_movQi32_stf32: +;CHECK: vmov.i32 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <4 x i32> <i32 1, i32 1, i32 1, i32 1> to <4 x float> + call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %val, i32 4) + ret void +} + +define void @v_movQi32_sti64(i8* %p) { +;CHECK-LABEL: v_movQi32_sti64: +;CHECK: vmov.i32 q{{.*}}, #0x1 +;CHECK-NOT: vrev + %val = bitcast <4 x i32> <i32 1, i32 1, i32 1, i32 1> to <2 x i64> + call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8) + ret void +} + +define void @v_movQf32_stf32(i8* %p) { +;CHECK-LABEL: v_movQf32_stf32: +;CHECK: vmov.f32 q{{.*}}, #1.000000e+00 +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 4) + ret void +} + +define void @v_movQf32_sti32(i8* %p) { +;CHECK-LABEL: v_movQf32_sti32: +;FIXME: Currently mov then vdup is used, when we should just use vmov.f32 +;FIXME-CHECK: vmov.f32 q{{.*}}, #1.000000e+00 +;FIXME-CHECK-NOT: vrev + %val = bitcast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> to <4 x i32> + call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> %val, i32 4) + ret void +} + +define void @v_movQf32_sti64(i8* %p) { +;CHECK-LABEL: v_movQf32_sti64: +;FIXME-CHECK: vmov.f32 q{{.*}}, #1.000000e+00 +;FIXME-CHECK-NOT: vrev + %val = bitcast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> to <2 x i64> + call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8) + ret void +} + +define void @v_movQi64_sti64(i8* %p) { +;CHECK-LABEL: v_movQi64_sti64: +;CHECK: vmov.i64 q{{.*}}, #0xff +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> <i64 255, i64 255>, i32 8) + ret void +} + +define void @v_mvni16_sti16(i8* %p) { +;CHECK-LABEL: v_mvni16_sti16: +;CHECK: vmvn.i16 d{{.*}}, #0xfe +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* %p, <4 x i16> <i16 65281, i16 65281, i16 65281, i16 65281>, i32 2) + ret void +} + +define void @v_mvni16_stf16(i8* %p) { +;CHECK-LABEL: v_mvni16_stf16: +;CHECK: vmvn.i16 d{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <4 x i16> <i16 65281, i16 65281, i16 65281, i16 65281> to <4 x half> + call void @llvm.arm.neon.vst1.p0i8.v4f16(i8* %p, <4 x half> %val, i32 2) + ret void +} + +define void @v_mvni16_sti32(i8* 
%p) { +;CHECK-LABEL: v_mvni16_sti32: +;CHECK: vmvn.i16 d{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <4 x i16> <i16 65281, i16 65281, i16 65281, i16 65281> to <2 x i32> + call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> %val, i32 4) + ret void +} + +define void @v_mvni16_stf32(i8* %p) { +;CHECK-LABEL: v_mvni16_stf32: +;CHECK: vmvn.i16 d{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <4 x i16> <i16 65281, i16 65281, i16 65281, i16 65281> to <2 x float> + call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> %val, i32 4) + ret void +} + +define void @v_mvni16_sti64(i8* %p) { +;CHECK-LABEL: v_mvni16_sti64: +;CHECK: vmvn.i16 d{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <4 x i16> <i16 65281, i16 65281, i16 65281, i16 65281> to <1 x i64> + call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8) + ret void +} + +define void @v_mvni32_sti32(i8* %p) { +;CHECK-LABEL: v_mvni32_sti32: +;CHECK: vmvn.i32 d{{.*}}, #0xfe +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %p, <2 x i32> <i32 4294967041, i32 4294967041>, i32 4) + ret void +} + +define void @v_mvni32_stf32(i8* %p) { +;CHECK-LABEL: v_mvni32_stf32: +;CHECK: vmvn.i32 d{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <2 x i32> <i32 4294967041, i32 4294967041> to <2 x float> + call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* %p, <2 x float> %val, i32 4) + ret void +} + +define void @v_mvni32_sti64(i8* %p) { +;CHECK-LABEL: v_mvni32_sti64: +;CHECK: vmvn.i32 d{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <2 x i32> <i32 4294967041, i32 4294967041> to <1 x i64> + call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* %p, <1 x i64> %val, i32 8) + ret void +} + + +define void @v_mvnQi16_sti16(i8* %p) { +;CHECK-LABEL: v_mvnQi16_sti16: +;CHECK: vmvn.i16 q{{.*}}, #0xfe +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %p, <8 x i16> <i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281>, i32 2) + ret void +} + +define void @v_mvnQi16_stf16(i8* %p) { +;CHECK-LABEL: v_mvnQi16_stf16: +;CHECK: vmvn.i16 q{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <8 x i16> <i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281> to <8 x half> + call void @llvm.arm.neon.vst1.p0i8.v8f16(i8* %p, <8 x half> %val, i32 2) + ret void +} + +define void @v_mvnQi16_sti32(i8* %p) { +;CHECK-LABEL: v_mvnQi16_sti32: +;CHECK: vmvn.i16 
q{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <8 x i16> <i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281> to <4 x i32> + call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> %val, i32 4) + ret void +} + +define void @v_mvnQi16_stf32(i8* %p) { +;CHECK-LABEL: v_mvnQi16_stf32: +;CHECK: vmvn.i16 q{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <8 x i16> <i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281> to <4 x float> + call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %val, i32 4) + ret void +} + +define void @v_mvnQi16_sti64(i8* %p) { +;CHECK-LABEL: v_mvnQi16_sti64: +;CHECK: vmvn.i16 q{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <8 x i16> <i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281, i16 65281> to <2 x i64> + call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8) + ret void +} + +define void @v_mvnQi32_sti32(i8* %p) { +;CHECK-LABEL: v_mvnQi32_sti32: +;CHECK: vmvn.i32 q{{.*}}, #0xfe +;CHECK-NOT: vrev + call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* %p, <4 x i32> <i32 4294967041, i32 4294967041, i32 4294967041, i32 4294967041>, i32 4) + ret void +} + +define void @v_mvnQi32_stf32(i8* %p) { +;CHECK-LABEL: v_mvnQi32_stf32: +;CHECK: vmvn.i32 q{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <4 x i32> <i32 4294967041, i32 4294967041, i32 4294967041, i32 4294967041> to <4 x float> + call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %p, <4 x float> %val, i32 4) + ret void +} + +define void @v_mvnQi32_sti64(i8* %p) { +;CHECK-LABEL: v_mvnQi32_sti64: +;CHECK: vmvn.i32 q{{.*}}, #0xfe +;CHECK-NOT: vrev + %val = bitcast <4 x i32> <i32 4294967041, i32 4294967041, i32 4294967041, i32 4294967041> to <2 x i64> + call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* %p, <2 x i64> %val, i32 8) + ret void +} + +declare void @llvm.arm.neon.vst1.p0i8.v8i8(i8*, <8 x i8>, i32) nounwind declare void @llvm.arm.neon.vst1.p0i8.v4i16(i8*, <4 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v4f16(i8*, <4 x half>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v2f32(i8*, <2 x float>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v1i64(i8*, <1 x i64>, i32) nounwind + +declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind 
+declare void @llvm.arm.neon.vst1.p0i8.v8f16(i8*, <8 x half>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v4i32(i8*, <4 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind +declare void @llvm.arm.neon.vst1.p0i8.v2i64(i8*, <2 x i64>, i32) nounwind