Index: lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- lib/Target/ARM/ARMInstrNEON.td
+++ lib/Target/ARM/ARMInstrNEON.td
@@ -6694,6 +6694,7 @@
 def : AlignedVEXTq;
+def : AlignedVEXTq; // v8f16 -> v4f16
 // VEXT : Vector Extract
@@ -7124,156 +7125,209 @@
   Requires<[HasNEON, DontUseVMOVSR]>;
 //===----------------------------------------------------------------------===//
-// Non-Instruction Patterns
+// Non-Instruction Patterns or Endianness - Revert Patterns
 //===----------------------------------------------------------------------===//
 // bit_convert
+// 64 bit conversions
+def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>;
+
+def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
+
+def : Pat<(v4i16 (bitconvert (v4f16 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4f16 (bitconvert (v4i16 DPR:$src))), (v4f16 DPR:$src)>;
+
+// 128 bit conversions
+def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>;
+
 let Predicates = [IsLE] in {
+  // 64 bit conversions
+  def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
+  def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
+  def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
+  def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
+  def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
+
+  def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
   def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (v1i64 DPR:$src)>;
   def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
   def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>;
-}
-def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>;
-let Predicates = [IsLE] in {
-  def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
+
+  def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>;
+
+  def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>;
   def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (v2i32 DPR:$src)>;
   def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
   def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>;
-  def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>;
-}
-def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
-let Predicates = [IsLE] in {
+
+  def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (v1i64 DPR:$src))), (v4f16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (v2f32 DPR:$src))), (v4f16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (v4f16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (v4f16 DPR:$src)>;
+
+  def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>;
   def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
   def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
   def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>;
-  def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>;
-  def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
+
+  def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>;
   def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>;
+  def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
   def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>;
+  def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (v8i8 DPR:$src)>;
   def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>;
-  def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>;
-  def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
-}
-def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
-let Predicates = [IsLE] in {
-  def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
-  def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
-  def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
-  def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
-  def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
-  def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
-  def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>;
-  def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
-}
-def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
-let Predicates = [IsLE] in {
-  def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
-  def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>;
-}
-let Predicates = [IsLE] in {
+  // 128 bit conversions
+  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+
+  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
   def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>;
   def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
   def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
-}
-def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
-let Predicates = [IsLE] in {
-  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
+
+  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+
+  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
   def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>;
   def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
   def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
-}
-def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
-let Predicates = [IsLE] in {
+
+  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
+
+  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
+
+  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
-}
-def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
-let Predicates = [IsLE] in {
-  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
-}
-def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
-let Predicates = [IsLE] in {
-  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
 }
 let Predicates = [IsBE] in {
   // 64 bit conversions
+  def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
+
+  def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
   def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
   def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
   def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
-  def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+
+  def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
+
+  def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
   def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>;
   def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
   def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
-  def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
+
+  def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
+
+  def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
   def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
   def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
   def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
-  def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
-  def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
+
+  def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>;
   def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (VREV64d8 DPR:$src)>;
+  def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
   def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>;
+  def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (VREV16d8 DPR:$src)>;
   def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>;
-  def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>;
-  def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
-  def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
-  def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
-  def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
-  def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
-  def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
-  def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
-  def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
-  def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
-  def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
   // 128 bit conversions
+  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
+
+  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
   def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
   def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
   def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+
+  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
+
+  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
   def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
   def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
   def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
-  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+
+  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
+
+  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
+
+  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (VREV16q8 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
 }
 // Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
Index: test/CodeGen/ARM/fp16-vector-argument.ll
===================================================================
--- test/CodeGen/ARM/fp16-vector-argument.ll
+++ test/CodeGen/ARM/fp16-vector-argument.ll
@@ -3,10 +3,10 @@
 ; RUN: llc -mtriple=armv8a -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=aapcs-gnu -float-abi=soft -o - %s | FileCheck %s --check-prefix=SOFTA
; RUN: llc -mtriple=armv8a -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=apcs-gnu -float-abi=hard -o - %s | FileCheck %s --check-prefix=HARD ; RUN: llc -mtriple=armv8a -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=aapcs-gnu -float-abi=hard -o - %s | FileCheck %s --check-prefix=HARDA -; RUNTOADD: llc -mtriple=armeb-eabi -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=apcs-gnu -float-abi=soft -o - %s | FileCheck %s --check-prefix=SOFTEB -; RUNTOADD: llc -mtriple=armeb-eabi -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=aapcs-gnu -float-abi=soft -o - %s | FileCheck %s --check-prefix=SOFTAEB -; RUNTOADD: llc -mtriple=armeb-eabi -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=apcs-gnu -float-abi=hard -o - %s | FileCheck %s --check-prefix=HARDEB -; RUNTOADD: llc -mtriple=armeb-eabi -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=aapcs-gnu -float-abi=hard -o - %s | FileCheck %s --check-prefix=HARDAEB +; RUN: llc -mtriple=armeb-eabi -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=apcs-gnu -float-abi=soft -o - %s | FileCheck %s --check-prefix=SOFTEB +; RUN: llc -mtriple=armeb-eabi -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=aapcs-gnu -float-abi=soft -o - %s | FileCheck %s --check-prefix=SOFTAEB +; RUN: llc -mtriple=armeb-eabi -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=apcs-gnu -float-abi=hard -o - %s | FileCheck %s --check-prefix=HARDEB +; RUN: llc -mtriple=armeb-eabi -mattr=+armv8.2-a,+fullfp16,+neon -target-abi=aapcs-gnu -float-abi=hard -o - %s | FileCheck %s --check-prefix=HARDAEB declare <4 x half> @llvm.fabs.v4f16(<4 x half>) declare <8 x half> @llvm.fabs.v8f16(<8 x half>) Index: test/CodeGen/ARM/fp16-vector-basic-instructions.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/fp16-vector-basic-instructions.ll @@ -0,0 +1,1840 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -o - -mtriple=arm-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=hard -target-abi=apcs-gnu %s | FileCheck %s --check-prefix=HARD +; RUN: llc -o - -mtriple=arm-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=soft -target-abi=apcs-gnu %s | FileCheck %s --check-prefix=SOFT +; RUN: llc -o - -mtriple=arm-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=hard -target-abi=aapcs-gnu %s | FileCheck %s --check-prefix=HARDA +; RUN: llc -o - -mtriple=arm-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=soft -target-abi=aapcs-gnu %s | FileCheck %s --check-prefix=SOFTA +; RUN: llc -o - -mtriple=armeb-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=hard -target-abi=apcs-gnu %s | FileCheck %s --check-prefix=HARDBE +; RUN: llc -o - -mtriple=armeb-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=soft -target-abi=apcs-gnu %s | FileCheck %s --check-prefix=SOFTFPBE +; RUN: llc -o - -mtriple=armeb-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=hard -target-abi=aapcs-gnu %s | FileCheck %s --check-prefix=HARDBEA +; RUN: llc -o - -mtriple=armeb-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=soft -target-abi=aapcs-gnu %s | FileCheck %s --check-prefix=SOFTFPBEA + +define i16 @bitcast_4xhalf_4xi16(<4 x half> %c) { +; HARD-LABEL: bitcast_4xhalf_4xi16: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vmov d16, r0, r1 +; HARD-NEXT: vdup.32 d16, d16[3] +; HARD-NEXT: vmov.u16 r0, d16[0] +; HARD-NEXT: bx lr +; +; SOFT-LABEL: bitcast_4xhalf_4xi16: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: vmov d16, r0, r1 +; SOFT-NEXT: vdup.32 d16, d16[3] +; SOFT-NEXT: vmov.u16 r0, d16[0] +; SOFT-NEXT: bx lr +; +; HARDA-LABEL: bitcast_4xhalf_4xi16: +; HARDA: @ %bb.0: @ %entry +; HARDA-NEXT: vdup.32 d16, 
d0[3] +; HARDA-NEXT: vmov.u16 r0, d16[0] +; HARDA-NEXT: bx lr +; +; SOFTA-LABEL: bitcast_4xhalf_4xi16: +; SOFTA: @ %bb.0: @ %entry +; SOFTA-NEXT: vmov d16, r0, r1 +; SOFTA-NEXT: vdup.32 d16, d16[3] +; SOFTA-NEXT: vmov.u16 r0, d16[0] +; SOFTA-NEXT: bx lr +; +; HARDBE-LABEL: bitcast_4xhalf_4xi16: +; HARDBE: @ %bb.0: @ %entry +; HARDBE-NEXT: vmov d16, r1, r0 +; HARDBE-NEXT: vrev64.16 d16, d16 +; HARDBE-NEXT: vdup.32 d16, d16[3] +; HARDBE-NEXT: vmov.u16 r0, d16[0] +; HARDBE-NEXT: bx lr +; +; SOFTFPBE-LABEL: bitcast_4xhalf_4xi16: +; SOFTFPBE: @ %bb.0: @ %entry +; SOFTFPBE-NEXT: vmov d16, r1, r0 +; SOFTFPBE-NEXT: vrev64.16 d16, d16 +; SOFTFPBE-NEXT: vdup.32 d16, d16[3] +; SOFTFPBE-NEXT: vmov.u16 r0, d16[0] +; SOFTFPBE-NEXT: bx lr +; +; HARDBEA-LABEL: bitcast_4xhalf_4xi16: +; HARDBEA: @ %bb.0: @ %entry +; HARDBEA-NEXT: vrev64.16 d16, d0 +; HARDBEA-NEXT: vdup.32 d16, d16[3] +; HARDBEA-NEXT: vmov.u16 r0, d16[0] +; HARDBEA-NEXT: bx lr +; +; SOFTFPBEA-LABEL: bitcast_4xhalf_4xi16: +; SOFTFPBEA: @ %bb.0: @ %entry +; SOFTFPBEA-NEXT: vmov d16, r1, r0 +; SOFTFPBEA-NEXT: vrev64.16 d16, d16 +; SOFTFPBEA-NEXT: vdup.32 d16, d16[3] +; SOFTFPBEA-NEXT: vmov.u16 r0, d16[0] +; SOFTFPBEA-NEXT: bx lr +entry: + %shuffle = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> + %0 = bitcast <4 x half> %shuffle to <4 x i16> + %vget_lane = extractelement <4 x i16> %0, i32 0 + ret i16 %vget_lane +} + +define <4 x half> @bitcast_8xi16_8xhalf(<8 x i16> %c) { +; HARD-LABEL: bitcast_8xi16_8xhalf: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vmov d16, r0, r1 +; HARD-NEXT: vorr d18, d16, d16 +; HARD-NEXT: vtrn.16 d16, d18 +; HARD-NEXT: vext.16 d16, d18, d18, #1 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: bx lr +; +; SOFT-LABEL: bitcast_8xi16_8xhalf: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: vmov d16, r0, r1 +; SOFT-NEXT: vorr d18, d16, d16 +; SOFT-NEXT: vtrn.16 d16, d18 +; SOFT-NEXT: vext.16 d16, d18, d18, #1 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: bx lr +; +; HARDA-LABEL: bitcast_8xi16_8xhalf: +; HARDA: @ %bb.0: @ %entry +; HARDA-NEXT: vorr d16, d0, d0 +; HARDA-NEXT: vtrn.16 d0, d16 +; HARDA-NEXT: vext.16 d0, d16, d16, #1 +; HARDA-NEXT: bx lr +; +; SOFTA-LABEL: bitcast_8xi16_8xhalf: +; SOFTA: @ %bb.0: @ %entry +; SOFTA-NEXT: vmov d16, r0, r1 +; SOFTA-NEXT: vorr d18, d16, d16 +; SOFTA-NEXT: vtrn.16 d16, d18 +; SOFTA-NEXT: vext.16 d16, d18, d18, #1 +; SOFTA-NEXT: vmov r0, r1, d16 +; SOFTA-NEXT: bx lr +; +; HARDBE-LABEL: bitcast_8xi16_8xhalf: +; HARDBE: @ %bb.0: @ %entry +; HARDBE-NEXT: vmov d16, r1, r0 +; HARDBE-NEXT: vrev64.16 q8, q8 +; HARDBE-NEXT: vorr d18, d16, d16 +; HARDBE-NEXT: vtrn.16 d16, d18 +; HARDBE-NEXT: vext.16 d16, d18, d18, #1 +; HARDBE-NEXT: vrev64.16 d16, d16 +; HARDBE-NEXT: vmov r1, r0, d16 +; HARDBE-NEXT: bx lr +; +; SOFTFPBE-LABEL: bitcast_8xi16_8xhalf: +; SOFTFPBE: @ %bb.0: @ %entry +; SOFTFPBE-NEXT: vmov d16, r1, r0 +; SOFTFPBE-NEXT: vrev64.16 q8, q8 +; SOFTFPBE-NEXT: vorr d18, d16, d16 +; SOFTFPBE-NEXT: vtrn.16 d16, d18 +; SOFTFPBE-NEXT: vext.16 d16, d18, d18, #1 +; SOFTFPBE-NEXT: vrev64.16 d16, d16 +; SOFTFPBE-NEXT: vmov r1, r0, d16 +; SOFTFPBE-NEXT: bx lr +; +; HARDBEA-LABEL: bitcast_8xi16_8xhalf: +; HARDBEA: @ %bb.0: @ %entry +; HARDBEA-NEXT: vrev64.16 q8, q0 +; HARDBEA-NEXT: vorr d18, d16, d16 +; HARDBEA-NEXT: vtrn.16 d16, d18 +; HARDBEA-NEXT: vext.16 d16, d18, d18, #1 +; HARDBEA-NEXT: vrev64.16 d0, d16 +; HARDBEA-NEXT: bx lr +; +; SOFTFPBEA-LABEL: bitcast_8xi16_8xhalf: +; SOFTFPBEA: @ %bb.0: @ %entry +; SOFTFPBEA-NEXT: vmov d16, r1, r0 +; SOFTFPBEA-NEXT: vrev64.16 q8, q8 +; SOFTFPBEA-NEXT: vorr 
d18, d16, d16 +; SOFTFPBEA-NEXT: vtrn.16 d16, d18 +; SOFTFPBEA-NEXT: vext.16 d16, d18, d18, #1 +; SOFTFPBEA-NEXT: vrev64.16 d16, d16 +; SOFTFPBEA-NEXT: vmov r1, r0, d16 +; SOFTFPBEA-NEXT: bx lr +entry: + %0 = bitcast <8 x i16> %c to <8 x half> + %shuffle = shufflevector <8 x half> %0, <8 x half> undef, <4 x i32> + ret <4 x half> %shuffle +} + +define <4 x half> @cast1(<1 x i64> %a) { +; HARD-LABEL: cast1: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vmov d16, r0, r1 +; HARD-NEXT: vldr d17, .LCPI2_0 +; HARD-NEXT: vadd.i8 d16, d16, d16 +; HARD-NEXT: vtbl.8 d16, {d16}, d17 +; HARD-NEXT: vadd.f16 d16, d16, d16 +; HARD-NEXT: vorr d18, d16, d16 +; HARD-NEXT: vdup.32 d17, d16[3] +; HARD-NEXT: vuzp.16 d16, d18 +; HARD-NEXT: vtrn.16 d16, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: bx lr +; HARD-NEXT: .p2align 3 +; HARD-NEXT: @ %bb.1: +; HARD-NEXT: .LCPI2_0: +; HARD-NEXT: .byte 1 @ 0x1 +; HARD-NEXT: .byte 7 @ 0x7 +; HARD-NEXT: .byte 0 @ 0x0 +; HARD-NEXT: .byte 7 @ 0x7 +; HARD-NEXT: .byte 0 @ 0x0 +; HARD-NEXT: .byte 0 @ 0x0 +; HARD-NEXT: .byte 0 @ 0x0 +; HARD-NEXT: .byte 0 @ 0x0 +; +; SOFT-LABEL: cast1: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: vmov d16, r0, r1 +; SOFT-NEXT: vldr d17, .LCPI2_0 +; SOFT-NEXT: vadd.i8 d16, d16, d16 +; SOFT-NEXT: vtbl.8 d16, {d16}, d17 +; SOFT-NEXT: vadd.f16 d16, d16, d16 +; SOFT-NEXT: vorr d18, d16, d16 +; SOFT-NEXT: vdup.32 d17, d16[3] +; SOFT-NEXT: vuzp.16 d16, d18 +; SOFT-NEXT: vtrn.16 d16, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: bx lr +; SOFT-NEXT: .p2align 3 +; SOFT-NEXT: @ %bb.1: +; SOFT-NEXT: .LCPI2_0: +; SOFT-NEXT: .byte 1 @ 0x1 +; SOFT-NEXT: .byte 7 @ 0x7 +; SOFT-NEXT: .byte 0 @ 0x0 +; SOFT-NEXT: .byte 7 @ 0x7 +; SOFT-NEXT: .byte 0 @ 0x0 +; SOFT-NEXT: .byte 0 @ 0x0 +; SOFT-NEXT: .byte 0 @ 0x0 +; SOFT-NEXT: .byte 0 @ 0x0 +; +; HARDA-LABEL: cast1: +; HARDA: @ %bb.0: @ %entry +; HARDA-NEXT: vldr d16, .LCPI2_0 +; HARDA-NEXT: vadd.i8 d17, d0, d0 +; HARDA-NEXT: vtbl.8 d16, {d17}, d16 +; HARDA-NEXT: vadd.f16 d0, d16, d16 +; HARDA-NEXT: vorr d17, d0, d0 +; HARDA-NEXT: vdup.32 d16, d0[3] +; HARDA-NEXT: vuzp.16 d0, d17 +; HARDA-NEXT: vtrn.16 d0, d16 +; HARDA-NEXT: bx lr +; HARDA-NEXT: .p2align 3 +; HARDA-NEXT: @ %bb.1: +; HARDA-NEXT: .LCPI2_0: +; HARDA-NEXT: .byte 1 @ 0x1 +; HARDA-NEXT: .byte 7 @ 0x7 +; HARDA-NEXT: .byte 0 @ 0x0 +; HARDA-NEXT: .byte 7 @ 0x7 +; HARDA-NEXT: .byte 0 @ 0x0 +; HARDA-NEXT: .byte 0 @ 0x0 +; HARDA-NEXT: .byte 0 @ 0x0 +; HARDA-NEXT: .byte 0 @ 0x0 +; +; SOFTA-LABEL: cast1: +; SOFTA: @ %bb.0: @ %entry +; SOFTA-NEXT: vmov d16, r0, r1 +; SOFTA-NEXT: vldr d17, .LCPI2_0 +; SOFTA-NEXT: vadd.i8 d16, d16, d16 +; SOFTA-NEXT: vtbl.8 d16, {d16}, d17 +; SOFTA-NEXT: vadd.f16 d16, d16, d16 +; SOFTA-NEXT: vorr d18, d16, d16 +; SOFTA-NEXT: vdup.32 d17, d16[3] +; SOFTA-NEXT: vuzp.16 d16, d18 +; SOFTA-NEXT: vtrn.16 d16, d17 +; SOFTA-NEXT: vmov r0, r1, d16 +; SOFTA-NEXT: bx lr +; SOFTA-NEXT: .p2align 3 +; SOFTA-NEXT: @ %bb.1: +; SOFTA-NEXT: .LCPI2_0: +; SOFTA-NEXT: .byte 1 @ 0x1 +; SOFTA-NEXT: .byte 7 @ 0x7 +; SOFTA-NEXT: .byte 0 @ 0x0 +; SOFTA-NEXT: .byte 7 @ 0x7 +; SOFTA-NEXT: .byte 0 @ 0x0 +; SOFTA-NEXT: .byte 0 @ 0x0 +; SOFTA-NEXT: .byte 0 @ 0x0 +; SOFTA-NEXT: .byte 0 @ 0x0 +; +; HARDBE-LABEL: cast1: +; HARDBE: @ %bb.0: @ %entry +; HARDBE-NEXT: vmov d16, r1, r0 +; HARDBE-NEXT: vldr d17, .LCPI2_0 +; HARDBE-NEXT: vrev64.8 d16, d16 +; HARDBE-NEXT: vrev64.8 d17, d17 +; HARDBE-NEXT: vadd.i8 d16, d16, d16 +; HARDBE-NEXT: vtbl.8 d16, {d16}, d17 +; HARDBE-NEXT: vrev16.8 d16, d16 +; HARDBE-NEXT: vadd.f16 d16, d16, d16 +; HARDBE-NEXT: vorr d18, d16, d16 +; 
HARDBE-NEXT: vdup.32 d17, d16[3] +; HARDBE-NEXT: vuzp.16 d16, d18 +; HARDBE-NEXT: vtrn.16 d16, d17 +; HARDBE-NEXT: vrev64.16 d16, d16 +; HARDBE-NEXT: vmov r1, r0, d16 +; HARDBE-NEXT: bx lr +; HARDBE-NEXT: .p2align 3 +; HARDBE-NEXT: @ %bb.1: +; HARDBE-NEXT: .LCPI2_0: +; HARDBE-NEXT: .byte 1 @ 0x1 +; HARDBE-NEXT: .byte 7 @ 0x7 +; HARDBE-NEXT: .byte 0 @ 0x0 +; HARDBE-NEXT: .byte 7 @ 0x7 +; HARDBE-NEXT: .byte 0 @ 0x0 +; HARDBE-NEXT: .byte 0 @ 0x0 +; HARDBE-NEXT: .byte 0 @ 0x0 +; HARDBE-NEXT: .byte 0 @ 0x0 +; +; SOFTFPBE-LABEL: cast1: +; SOFTFPBE: @ %bb.0: @ %entry +; SOFTFPBE-NEXT: vmov d16, r1, r0 +; SOFTFPBE-NEXT: vldr d17, .LCPI2_0 +; SOFTFPBE-NEXT: vrev64.8 d16, d16 +; SOFTFPBE-NEXT: vrev64.8 d17, d17 +; SOFTFPBE-NEXT: vadd.i8 d16, d16, d16 +; SOFTFPBE-NEXT: vtbl.8 d16, {d16}, d17 +; SOFTFPBE-NEXT: vrev16.8 d16, d16 +; SOFTFPBE-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBE-NEXT: vorr d18, d16, d16 +; SOFTFPBE-NEXT: vdup.32 d17, d16[3] +; SOFTFPBE-NEXT: vuzp.16 d16, d18 +; SOFTFPBE-NEXT: vtrn.16 d16, d17 +; SOFTFPBE-NEXT: vrev64.16 d16, d16 +; SOFTFPBE-NEXT: vmov r1, r0, d16 +; SOFTFPBE-NEXT: bx lr +; SOFTFPBE-NEXT: .p2align 3 +; SOFTFPBE-NEXT: @ %bb.1: +; SOFTFPBE-NEXT: .LCPI2_0: +; SOFTFPBE-NEXT: .byte 1 @ 0x1 +; SOFTFPBE-NEXT: .byte 7 @ 0x7 +; SOFTFPBE-NEXT: .byte 0 @ 0x0 +; SOFTFPBE-NEXT: .byte 7 @ 0x7 +; SOFTFPBE-NEXT: .byte 0 @ 0x0 +; SOFTFPBE-NEXT: .byte 0 @ 0x0 +; SOFTFPBE-NEXT: .byte 0 @ 0x0 +; SOFTFPBE-NEXT: .byte 0 @ 0x0 +; +; HARDBEA-LABEL: cast1: +; HARDBEA: @ %bb.0: @ %entry +; HARDBEA-NEXT: vldr d16, .LCPI2_0 +; HARDBEA-NEXT: vrev64.8 d17, d0 +; HARDBEA-NEXT: vrev64.8 d16, d16 +; HARDBEA-NEXT: vadd.i8 d17, d17, d17 +; HARDBEA-NEXT: vtbl.8 d16, {d17}, d16 +; HARDBEA-NEXT: vrev16.8 d16, d16 +; HARDBEA-NEXT: vadd.f16 d16, d16, d16 +; HARDBEA-NEXT: vorr d18, d16, d16 +; HARDBEA-NEXT: vdup.32 d17, d16[3] +; HARDBEA-NEXT: vuzp.16 d16, d18 +; HARDBEA-NEXT: vtrn.16 d16, d17 +; HARDBEA-NEXT: vrev64.16 d0, d16 +; HARDBEA-NEXT: bx lr +; HARDBEA-NEXT: .p2align 3 +; HARDBEA-NEXT: @ %bb.1: +; HARDBEA-NEXT: .LCPI2_0: +; HARDBEA-NEXT: .byte 1 @ 0x1 +; HARDBEA-NEXT: .byte 7 @ 0x7 +; HARDBEA-NEXT: .byte 0 @ 0x0 +; HARDBEA-NEXT: .byte 7 @ 0x7 +; HARDBEA-NEXT: .byte 0 @ 0x0 +; HARDBEA-NEXT: .byte 0 @ 0x0 +; HARDBEA-NEXT: .byte 0 @ 0x0 +; HARDBEA-NEXT: .byte 0 @ 0x0 +; +; SOFTFPBEA-LABEL: cast1: +; SOFTFPBEA: @ %bb.0: @ %entry +; SOFTFPBEA-NEXT: vmov d16, r1, r0 +; SOFTFPBEA-NEXT: vldr d17, .LCPI2_0 +; SOFTFPBEA-NEXT: vrev64.8 d16, d16 +; SOFTFPBEA-NEXT: vrev64.8 d17, d17 +; SOFTFPBEA-NEXT: vadd.i8 d16, d16, d16 +; SOFTFPBEA-NEXT: vtbl.8 d16, {d16}, d17 +; SOFTFPBEA-NEXT: vrev16.8 d16, d16 +; SOFTFPBEA-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBEA-NEXT: vorr d18, d16, d16 +; SOFTFPBEA-NEXT: vdup.32 d17, d16[3] +; SOFTFPBEA-NEXT: vuzp.16 d16, d18 +; SOFTFPBEA-NEXT: vtrn.16 d16, d17 +; SOFTFPBEA-NEXT: vrev64.16 d16, d16 +; SOFTFPBEA-NEXT: vmov r1, r0, d16 +; SOFTFPBEA-NEXT: bx lr +; SOFTFPBEA-NEXT: .p2align 3 +; SOFTFPBEA-NEXT: @ %bb.1: +; SOFTFPBEA-NEXT: .LCPI2_0: +; SOFTFPBEA-NEXT: .byte 1 @ 0x1 +; SOFTFPBEA-NEXT: .byte 7 @ 0x7 +; SOFTFPBEA-NEXT: .byte 0 @ 0x0 +; SOFTFPBEA-NEXT: .byte 7 @ 0x7 +; SOFTFPBEA-NEXT: .byte 0 @ 0x0 +; SOFTFPBEA-NEXT: .byte 0 @ 0x0 +; SOFTFPBEA-NEXT: .byte 0 @ 0x0 +; SOFTFPBEA-NEXT: .byte 0 @ 0x0 +entry: + %v1 = bitcast <1 x i64> %a to <8 x i8> + %add1 = add <8 x i8> %v1, %v1 + %shuffle = shufflevector <8 x i8> %add1, <8 x i8> %add1, <8 x i32> + %v4f16 = bitcast <8 x i8> %shuffle to <4 x half> + %add2 = fadd <4 x half> %v4f16, %v4f16 + %r = shufflevector <4 x half> %add2, <4 x 
half> %add2, <4 x i32> + ret <4 x half> %r +} + +define <4 x half> @cast2(<1 x i64> %a) { +; HARD-LABEL: cast2: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vmov d16, r0, r1 +; HARD-NEXT: vadd.i16 d16, d16, d16 +; HARD-NEXT: vdup.16 d17, d16[3] +; HARD-NEXT: vzip.16 d16, d17 +; HARD-NEXT: vext.16 d16, d16, d16, #2 +; HARD-NEXT: vadd.f16 d16, d16, d16 +; HARD-NEXT: vorr d18, d16, d16 +; HARD-NEXT: vdup.32 d17, d16[3] +; HARD-NEXT: vuzp.16 d16, d18 +; HARD-NEXT: vtrn.16 d16, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: bx lr +; +; SOFT-LABEL: cast2: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: vmov d16, r0, r1 +; SOFT-NEXT: vadd.i16 d16, d16, d16 +; SOFT-NEXT: vdup.16 d17, d16[3] +; SOFT-NEXT: vzip.16 d16, d17 +; SOFT-NEXT: vext.16 d16, d16, d16, #2 +; SOFT-NEXT: vadd.f16 d16, d16, d16 +; SOFT-NEXT: vorr d18, d16, d16 +; SOFT-NEXT: vdup.32 d17, d16[3] +; SOFT-NEXT: vuzp.16 d16, d18 +; SOFT-NEXT: vtrn.16 d16, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: bx lr +; +; HARDA-LABEL: cast2: +; HARDA: @ %bb.0: @ %entry +; HARDA-NEXT: vadd.i16 d16, d0, d0 +; HARDA-NEXT: vdup.16 d17, d16[3] +; HARDA-NEXT: vzip.16 d16, d17 +; HARDA-NEXT: vext.16 d16, d16, d16, #2 +; HARDA-NEXT: vadd.f16 d0, d16, d16 +; HARDA-NEXT: vorr d17, d0, d0 +; HARDA-NEXT: vdup.32 d16, d0[3] +; HARDA-NEXT: vuzp.16 d0, d17 +; HARDA-NEXT: vtrn.16 d0, d16 +; HARDA-NEXT: bx lr +; +; SOFTA-LABEL: cast2: +; SOFTA: @ %bb.0: @ %entry +; SOFTA-NEXT: vmov d16, r0, r1 +; SOFTA-NEXT: vadd.i16 d16, d16, d16 +; SOFTA-NEXT: vdup.16 d17, d16[3] +; SOFTA-NEXT: vzip.16 d16, d17 +; SOFTA-NEXT: vext.16 d16, d16, d16, #2 +; SOFTA-NEXT: vadd.f16 d16, d16, d16 +; SOFTA-NEXT: vorr d18, d16, d16 +; SOFTA-NEXT: vdup.32 d17, d16[3] +; SOFTA-NEXT: vuzp.16 d16, d18 +; SOFTA-NEXT: vtrn.16 d16, d17 +; SOFTA-NEXT: vmov r0, r1, d16 +; SOFTA-NEXT: bx lr +; +; HARDBE-LABEL: cast2: +; HARDBE: @ %bb.0: @ %entry +; HARDBE-NEXT: vmov d16, r1, r0 +; HARDBE-NEXT: vrev64.16 d16, d16 +; HARDBE-NEXT: vadd.i16 d16, d16, d16 +; HARDBE-NEXT: vdup.16 d17, d16[3] +; HARDBE-NEXT: vzip.16 d16, d17 +; HARDBE-NEXT: vext.16 d16, d16, d16, #2 +; HARDBE-NEXT: vadd.f16 d16, d16, d16 +; HARDBE-NEXT: vorr d18, d16, d16 +; HARDBE-NEXT: vdup.32 d17, d16[3] +; HARDBE-NEXT: vuzp.16 d16, d18 +; HARDBE-NEXT: vtrn.16 d16, d17 +; HARDBE-NEXT: vrev64.16 d16, d16 +; HARDBE-NEXT: vmov r1, r0, d16 +; HARDBE-NEXT: bx lr +; +; SOFTFPBE-LABEL: cast2: +; SOFTFPBE: @ %bb.0: @ %entry +; SOFTFPBE-NEXT: vmov d16, r1, r0 +; SOFTFPBE-NEXT: vrev64.16 d16, d16 +; SOFTFPBE-NEXT: vadd.i16 d16, d16, d16 +; SOFTFPBE-NEXT: vdup.16 d17, d16[3] +; SOFTFPBE-NEXT: vzip.16 d16, d17 +; SOFTFPBE-NEXT: vext.16 d16, d16, d16, #2 +; SOFTFPBE-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBE-NEXT: vorr d18, d16, d16 +; SOFTFPBE-NEXT: vdup.32 d17, d16[3] +; SOFTFPBE-NEXT: vuzp.16 d16, d18 +; SOFTFPBE-NEXT: vtrn.16 d16, d17 +; SOFTFPBE-NEXT: vrev64.16 d16, d16 +; SOFTFPBE-NEXT: vmov r1, r0, d16 +; SOFTFPBE-NEXT: bx lr +; +; HARDBEA-LABEL: cast2: +; HARDBEA: @ %bb.0: @ %entry +; HARDBEA-NEXT: vrev64.16 d16, d0 +; HARDBEA-NEXT: vadd.i16 d16, d16, d16 +; HARDBEA-NEXT: vdup.16 d17, d16[3] +; HARDBEA-NEXT: vzip.16 d16, d17 +; HARDBEA-NEXT: vext.16 d16, d16, d16, #2 +; HARDBEA-NEXT: vadd.f16 d16, d16, d16 +; HARDBEA-NEXT: vorr d18, d16, d16 +; HARDBEA-NEXT: vdup.32 d17, d16[3] +; HARDBEA-NEXT: vuzp.16 d16, d18 +; HARDBEA-NEXT: vtrn.16 d16, d17 +; HARDBEA-NEXT: vrev64.16 d0, d16 +; HARDBEA-NEXT: bx lr +; +; SOFTFPBEA-LABEL: cast2: +; SOFTFPBEA: @ %bb.0: @ %entry +; SOFTFPBEA-NEXT: vmov d16, r1, r0 +; SOFTFPBEA-NEXT: vrev64.16 d16, d16 +; 
SOFTFPBEA-NEXT: vadd.i16 d16, d16, d16 +; SOFTFPBEA-NEXT: vdup.16 d17, d16[3] +; SOFTFPBEA-NEXT: vzip.16 d16, d17 +; SOFTFPBEA-NEXT: vext.16 d16, d16, d16, #2 +; SOFTFPBEA-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBEA-NEXT: vorr d18, d16, d16 +; SOFTFPBEA-NEXT: vdup.32 d17, d16[3] +; SOFTFPBEA-NEXT: vuzp.16 d16, d18 +; SOFTFPBEA-NEXT: vtrn.16 d16, d17 +; SOFTFPBEA-NEXT: vrev64.16 d16, d16 +; SOFTFPBEA-NEXT: vmov r1, r0, d16 +; SOFTFPBEA-NEXT: bx lr +entry: + %v1 = bitcast <1 x i64> %a to <4 x i16> + %add1 = add <4 x i16> %v1, %v1 + %shuffle = shufflevector <4 x i16> %add1, <4 x i16> %add1, <4 x i32> + %v4f16 = bitcast <4 x i16> %shuffle to <4 x half> + %add2 = fadd <4 x half> %v4f16, %v4f16 + %r = shufflevector <4 x half> %add2, <4 x half> %add2, <4 x i32> + ret <4 x half> %r +} + +define <4 x half> @cast3(<1 x i64> %a) { +; HARD-LABEL: cast3: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vmov d16, r0, r1 +; HARD-NEXT: vadd.f32 d16, d16, d16 +; HARD-NEXT: vrev64.32 d16, d16 +; HARD-NEXT: vadd.f16 d16, d16, d16 +; HARD-NEXT: vorr d18, d16, d16 +; HARD-NEXT: vdup.32 d17, d16[3] +; HARD-NEXT: vuzp.16 d16, d18 +; HARD-NEXT: vtrn.16 d16, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: bx lr +; +; SOFT-LABEL: cast3: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: vmov d16, r0, r1 +; SOFT-NEXT: vadd.f32 d16, d16, d16 +; SOFT-NEXT: vrev64.32 d16, d16 +; SOFT-NEXT: vadd.f16 d16, d16, d16 +; SOFT-NEXT: vorr d18, d16, d16 +; SOFT-NEXT: vdup.32 d17, d16[3] +; SOFT-NEXT: vuzp.16 d16, d18 +; SOFT-NEXT: vtrn.16 d16, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: bx lr +; +; HARDA-LABEL: cast3: +; HARDA: @ %bb.0: @ %entry +; HARDA-NEXT: vadd.f32 d16, d0, d0 +; HARDA-NEXT: vrev64.32 d16, d16 +; HARDA-NEXT: vadd.f16 d0, d16, d16 +; HARDA-NEXT: vorr d17, d0, d0 +; HARDA-NEXT: vdup.32 d16, d0[3] +; HARDA-NEXT: vuzp.16 d0, d17 +; HARDA-NEXT: vtrn.16 d0, d16 +; HARDA-NEXT: bx lr +; +; SOFTA-LABEL: cast3: +; SOFTA: @ %bb.0: @ %entry +; SOFTA-NEXT: vmov d16, r0, r1 +; SOFTA-NEXT: vadd.f32 d16, d16, d16 +; SOFTA-NEXT: vrev64.32 d16, d16 +; SOFTA-NEXT: vadd.f16 d16, d16, d16 +; SOFTA-NEXT: vorr d18, d16, d16 +; SOFTA-NEXT: vdup.32 d17, d16[3] +; SOFTA-NEXT: vuzp.16 d16, d18 +; SOFTA-NEXT: vtrn.16 d16, d17 +; SOFTA-NEXT: vmov r0, r1, d16 +; SOFTA-NEXT: bx lr +; +; HARDBE-LABEL: cast3: +; HARDBE: @ %bb.0: @ %entry +; HARDBE-NEXT: vmov d16, r1, r0 +; HARDBE-NEXT: vrev64.32 d16, d16 +; HARDBE-NEXT: vadd.f32 d16, d16, d16 +; HARDBE-NEXT: vrev64.32 d16, d16 +; HARDBE-NEXT: vrev32.16 d16, d16 +; HARDBE-NEXT: vadd.f16 d16, d16, d16 +; HARDBE-NEXT: vorr d18, d16, d16 +; HARDBE-NEXT: vdup.32 d17, d16[3] +; HARDBE-NEXT: vuzp.16 d16, d18 +; HARDBE-NEXT: vtrn.16 d16, d17 +; HARDBE-NEXT: vrev64.16 d16, d16 +; HARDBE-NEXT: vmov r1, r0, d16 +; HARDBE-NEXT: bx lr +; +; SOFTFPBE-LABEL: cast3: +; SOFTFPBE: @ %bb.0: @ %entry +; SOFTFPBE-NEXT: vmov d16, r1, r0 +; SOFTFPBE-NEXT: vrev64.32 d16, d16 +; SOFTFPBE-NEXT: vadd.f32 d16, d16, d16 +; SOFTFPBE-NEXT: vrev64.32 d16, d16 +; SOFTFPBE-NEXT: vrev32.16 d16, d16 +; SOFTFPBE-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBE-NEXT: vorr d18, d16, d16 +; SOFTFPBE-NEXT: vdup.32 d17, d16[3] +; SOFTFPBE-NEXT: vuzp.16 d16, d18 +; SOFTFPBE-NEXT: vtrn.16 d16, d17 +; SOFTFPBE-NEXT: vrev64.16 d16, d16 +; SOFTFPBE-NEXT: vmov r1, r0, d16 +; SOFTFPBE-NEXT: bx lr +; +; HARDBEA-LABEL: cast3: +; HARDBEA: @ %bb.0: @ %entry +; HARDBEA-NEXT: vrev64.32 d16, d0 +; HARDBEA-NEXT: vadd.f32 d16, d16, d16 +; HARDBEA-NEXT: vrev64.32 d16, d16 +; HARDBEA-NEXT: vrev32.16 d16, d16 +; HARDBEA-NEXT: vadd.f16 d16, d16, d16 +; HARDBEA-NEXT: vorr 
d18, d16, d16 +; HARDBEA-NEXT: vdup.32 d17, d16[3] +; HARDBEA-NEXT: vuzp.16 d16, d18 +; HARDBEA-NEXT: vtrn.16 d16, d17 +; HARDBEA-NEXT: vrev64.16 d0, d16 +; HARDBEA-NEXT: bx lr +; +; SOFTFPBEA-LABEL: cast3: +; SOFTFPBEA: @ %bb.0: @ %entry +; SOFTFPBEA-NEXT: vmov d16, r1, r0 +; SOFTFPBEA-NEXT: vrev64.32 d16, d16 +; SOFTFPBEA-NEXT: vadd.f32 d16, d16, d16 +; SOFTFPBEA-NEXT: vrev64.32 d16, d16 +; SOFTFPBEA-NEXT: vrev32.16 d16, d16 +; SOFTFPBEA-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBEA-NEXT: vorr d18, d16, d16 +; SOFTFPBEA-NEXT: vdup.32 d17, d16[3] +; SOFTFPBEA-NEXT: vuzp.16 d16, d18 +; SOFTFPBEA-NEXT: vtrn.16 d16, d17 +; SOFTFPBEA-NEXT: vrev64.16 d16, d16 +; SOFTFPBEA-NEXT: vmov r1, r0, d16 +; SOFTFPBEA-NEXT: bx lr +entry: + %v1 = bitcast <1 x i64> %a to <2 x float> + %add1 = fadd <2 x float> %v1, %v1 + %shuffle = shufflevector <2 x float> %add1, <2 x float> %add1, <2 x i32> + %v4f16 = bitcast <2 x float> %shuffle to <4 x half> + %add2 = fadd <4 x half> %v4f16, %v4f16 + %r = shufflevector <4 x half> %add2, <4 x half> %add2, <4 x i32> + ret <4 x half> %r +} + +define <4 x half> @cast4(<1 x i64> %a) { +; HARD-LABEL: cast4: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vmov d16, r0, r1 +; HARD-NEXT: vadd.i32 d16, d16, d16 +; HARD-NEXT: vrev64.32 d16, d16 +; HARD-NEXT: vadd.f16 d16, d16, d16 +; HARD-NEXT: vorr d18, d16, d16 +; HARD-NEXT: vdup.32 d17, d16[3] +; HARD-NEXT: vuzp.16 d16, d18 +; HARD-NEXT: vtrn.16 d16, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: bx lr +; +; SOFT-LABEL: cast4: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: vmov d16, r0, r1 +; SOFT-NEXT: vadd.i32 d16, d16, d16 +; SOFT-NEXT: vrev64.32 d16, d16 +; SOFT-NEXT: vadd.f16 d16, d16, d16 +; SOFT-NEXT: vorr d18, d16, d16 +; SOFT-NEXT: vdup.32 d17, d16[3] +; SOFT-NEXT: vuzp.16 d16, d18 +; SOFT-NEXT: vtrn.16 d16, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: bx lr +; +; HARDA-LABEL: cast4: +; HARDA: @ %bb.0: @ %entry +; HARDA-NEXT: vadd.i32 d16, d0, d0 +; HARDA-NEXT: vrev64.32 d16, d16 +; HARDA-NEXT: vadd.f16 d0, d16, d16 +; HARDA-NEXT: vorr d17, d0, d0 +; HARDA-NEXT: vdup.32 d16, d0[3] +; HARDA-NEXT: vuzp.16 d0, d17 +; HARDA-NEXT: vtrn.16 d0, d16 +; HARDA-NEXT: bx lr +; +; SOFTA-LABEL: cast4: +; SOFTA: @ %bb.0: @ %entry +; SOFTA-NEXT: vmov d16, r0, r1 +; SOFTA-NEXT: vadd.i32 d16, d16, d16 +; SOFTA-NEXT: vrev64.32 d16, d16 +; SOFTA-NEXT: vadd.f16 d16, d16, d16 +; SOFTA-NEXT: vorr d18, d16, d16 +; SOFTA-NEXT: vdup.32 d17, d16[3] +; SOFTA-NEXT: vuzp.16 d16, d18 +; SOFTA-NEXT: vtrn.16 d16, d17 +; SOFTA-NEXT: vmov r0, r1, d16 +; SOFTA-NEXT: bx lr +; +; HARDBE-LABEL: cast4: +; HARDBE: @ %bb.0: @ %entry +; HARDBE-NEXT: vmov d16, r1, r0 +; HARDBE-NEXT: vrev64.32 d16, d16 +; HARDBE-NEXT: vadd.i32 d16, d16, d16 +; HARDBE-NEXT: vrev64.32 d16, d16 +; HARDBE-NEXT: vrev32.16 d16, d16 +; HARDBE-NEXT: vadd.f16 d16, d16, d16 +; HARDBE-NEXT: vorr d18, d16, d16 +; HARDBE-NEXT: vdup.32 d17, d16[3] +; HARDBE-NEXT: vuzp.16 d16, d18 +; HARDBE-NEXT: vtrn.16 d16, d17 +; HARDBE-NEXT: vrev64.16 d16, d16 +; HARDBE-NEXT: vmov r1, r0, d16 +; HARDBE-NEXT: bx lr +; +; SOFTFPBE-LABEL: cast4: +; SOFTFPBE: @ %bb.0: @ %entry +; SOFTFPBE-NEXT: vmov d16, r1, r0 +; SOFTFPBE-NEXT: vrev64.32 d16, d16 +; SOFTFPBE-NEXT: vadd.i32 d16, d16, d16 +; SOFTFPBE-NEXT: vrev64.32 d16, d16 +; SOFTFPBE-NEXT: vrev32.16 d16, d16 +; SOFTFPBE-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBE-NEXT: vorr d18, d16, d16 +; SOFTFPBE-NEXT: vdup.32 d17, d16[3] +; SOFTFPBE-NEXT: vuzp.16 d16, d18 +; SOFTFPBE-NEXT: vtrn.16 d16, d17 +; SOFTFPBE-NEXT: vrev64.16 d16, d16 +; SOFTFPBE-NEXT: vmov r1, r0, d16 +; 
SOFTFPBE-NEXT: bx lr +; +; HARDBEA-LABEL: cast4: +; HARDBEA: @ %bb.0: @ %entry +; HARDBEA-NEXT: vrev64.32 d16, d0 +; HARDBEA-NEXT: vadd.i32 d16, d16, d16 +; HARDBEA-NEXT: vrev64.32 d16, d16 +; HARDBEA-NEXT: vrev32.16 d16, d16 +; HARDBEA-NEXT: vadd.f16 d16, d16, d16 +; HARDBEA-NEXT: vorr d18, d16, d16 +; HARDBEA-NEXT: vdup.32 d17, d16[3] +; HARDBEA-NEXT: vuzp.16 d16, d18 +; HARDBEA-NEXT: vtrn.16 d16, d17 +; HARDBEA-NEXT: vrev64.16 d0, d16 +; HARDBEA-NEXT: bx lr +; +; SOFTFPBEA-LABEL: cast4: +; SOFTFPBEA: @ %bb.0: @ %entry +; SOFTFPBEA-NEXT: vmov d16, r1, r0 +; SOFTFPBEA-NEXT: vrev64.32 d16, d16 +; SOFTFPBEA-NEXT: vadd.i32 d16, d16, d16 +; SOFTFPBEA-NEXT: vrev64.32 d16, d16 +; SOFTFPBEA-NEXT: vrev32.16 d16, d16 +; SOFTFPBEA-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBEA-NEXT: vorr d18, d16, d16 +; SOFTFPBEA-NEXT: vdup.32 d17, d16[3] +; SOFTFPBEA-NEXT: vuzp.16 d16, d18 +; SOFTFPBEA-NEXT: vtrn.16 d16, d17 +; SOFTFPBEA-NEXT: vrev64.16 d16, d16 +; SOFTFPBEA-NEXT: vmov r1, r0, d16 +; SOFTFPBEA-NEXT: bx lr +entry: + %v1 = bitcast <1 x i64> %a to <2 x i32> + %add1 = add <2 x i32> %v1, %v1 + %shuffle = shufflevector <2 x i32> %add1, <2 x i32> %add1, <2 x i32> + %v4f16 = bitcast <2 x i32> %shuffle to <4 x half> + %add2 = fadd <4 x half> %v4f16, %v4f16 + %r = shufflevector <4 x half> %add2, <4 x half> %add2, <4 x i32> + ret <4 x half> %r +} + +define <8 x i8> @cast5( <4 x half> %a) { +; HARD-LABEL: cast5: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vmov d16, r0, r1 +; HARD-NEXT: vadd.f16 d16, d16, d16 +; HARD-NEXT: vorr d17, d16, d16 +; HARD-NEXT: vorr d18, d16, d16 +; HARD-NEXT: vuzp.16 d17, d18 +; HARD-NEXT: vldr d17, .LCPI6_0 +; HARD-NEXT: vext.16 d16, d18, d16, #2 +; HARD-NEXT: vadd.i8 d16, d16, d16 +; HARD-NEXT: vtbl.8 d16, {d16}, d17 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: bx lr +; HARD-NEXT: .p2align 3 +; HARD-NEXT: @ %bb.1: +; HARD-NEXT: .LCPI6_0: +; HARD-NEXT: .byte 0 @ 0x0 +; HARD-NEXT: .byte 7 @ 0x7 +; HARD-NEXT: .byte 0 @ 0x0 +; HARD-NEXT: .byte 7 @ 0x7 +; HARD-NEXT: .byte 3 @ 0x3 +; HARD-NEXT: .byte 1 @ 0x1 +; HARD-NEXT: .byte 0 @ 0x0 +; HARD-NEXT: .byte 5 @ 0x5 +; +; SOFT-LABEL: cast5: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: vmov d16, r0, r1 +; SOFT-NEXT: vadd.f16 d16, d16, d16 +; SOFT-NEXT: vorr d17, d16, d16 +; SOFT-NEXT: vorr d18, d16, d16 +; SOFT-NEXT: vuzp.16 d17, d18 +; SOFT-NEXT: vldr d17, .LCPI6_0 +; SOFT-NEXT: vext.16 d16, d18, d16, #2 +; SOFT-NEXT: vadd.i8 d16, d16, d16 +; SOFT-NEXT: vtbl.8 d16, {d16}, d17 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: bx lr +; SOFT-NEXT: .p2align 3 +; SOFT-NEXT: @ %bb.1: +; SOFT-NEXT: .LCPI6_0: +; SOFT-NEXT: .byte 0 @ 0x0 +; SOFT-NEXT: .byte 7 @ 0x7 +; SOFT-NEXT: .byte 0 @ 0x0 +; SOFT-NEXT: .byte 7 @ 0x7 +; SOFT-NEXT: .byte 3 @ 0x3 +; SOFT-NEXT: .byte 1 @ 0x1 +; SOFT-NEXT: .byte 0 @ 0x0 +; SOFT-NEXT: .byte 5 @ 0x5 +; +; HARDA-LABEL: cast5: +; HARDA: @ %bb.0: @ %entry +; HARDA-NEXT: vadd.f16 d16, d0, d0 +; HARDA-NEXT: vorr d17, d16, d16 +; HARDA-NEXT: vorr d18, d16, d16 +; HARDA-NEXT: vuzp.16 d17, d18 +; HARDA-NEXT: vldr d17, .LCPI6_0 +; HARDA-NEXT: vext.16 d16, d18, d16, #2 +; HARDA-NEXT: vadd.i8 d16, d16, d16 +; HARDA-NEXT: vtbl.8 d0, {d16}, d17 +; HARDA-NEXT: bx lr +; HARDA-NEXT: .p2align 3 +; HARDA-NEXT: @ %bb.1: +; HARDA-NEXT: .LCPI6_0: +; HARDA-NEXT: .byte 0 @ 0x0 +; HARDA-NEXT: .byte 7 @ 0x7 +; HARDA-NEXT: .byte 0 @ 0x0 +; HARDA-NEXT: .byte 7 @ 0x7 +; HARDA-NEXT: .byte 3 @ 0x3 +; HARDA-NEXT: .byte 1 @ 0x1 +; HARDA-NEXT: .byte 0 @ 0x0 +; HARDA-NEXT: .byte 5 @ 0x5 +; +; SOFTA-LABEL: cast5: +; SOFTA: @ %bb.0: @ %entry +; SOFTA-NEXT: vmov d16, 
r0, r1 +; SOFTA-NEXT: vadd.f16 d16, d16, d16 +; SOFTA-NEXT: vorr d17, d16, d16 +; SOFTA-NEXT: vorr d18, d16, d16 +; SOFTA-NEXT: vuzp.16 d17, d18 +; SOFTA-NEXT: vldr d17, .LCPI6_0 +; SOFTA-NEXT: vext.16 d16, d18, d16, #2 +; SOFTA-NEXT: vadd.i8 d16, d16, d16 +; SOFTA-NEXT: vtbl.8 d16, {d16}, d17 +; SOFTA-NEXT: vmov r0, r1, d16 +; SOFTA-NEXT: bx lr +; SOFTA-NEXT: .p2align 3 +; SOFTA-NEXT: @ %bb.1: +; SOFTA-NEXT: .LCPI6_0: +; SOFTA-NEXT: .byte 0 @ 0x0 +; SOFTA-NEXT: .byte 7 @ 0x7 +; SOFTA-NEXT: .byte 0 @ 0x0 +; SOFTA-NEXT: .byte 7 @ 0x7 +; SOFTA-NEXT: .byte 3 @ 0x3 +; SOFTA-NEXT: .byte 1 @ 0x1 +; SOFTA-NEXT: .byte 0 @ 0x0 +; SOFTA-NEXT: .byte 5 @ 0x5 +; +; HARDBE-LABEL: cast5: +; HARDBE: @ %bb.0: @ %entry +; HARDBE-NEXT: vmov d16, r1, r0 +; HARDBE-NEXT: vrev64.16 d16, d16 +; HARDBE-NEXT: vadd.f16 d16, d16, d16 +; HARDBE-NEXT: vorr d17, d16, d16 +; HARDBE-NEXT: vorr d18, d16, d16 +; HARDBE-NEXT: vuzp.16 d17, d18 +; HARDBE-NEXT: vldr d17, .LCPI6_0 +; HARDBE-NEXT: vext.16 d16, d18, d16, #2 +; HARDBE-NEXT: vrev64.8 d17, d17 +; HARDBE-NEXT: vrev16.8 d16, d16 +; HARDBE-NEXT: vadd.i8 d16, d16, d16 +; HARDBE-NEXT: vtbl.8 d16, {d16}, d17 +; HARDBE-NEXT: vrev64.8 d16, d16 +; HARDBE-NEXT: vmov r1, r0, d16 +; HARDBE-NEXT: bx lr +; HARDBE-NEXT: .p2align 3 +; HARDBE-NEXT: @ %bb.1: +; HARDBE-NEXT: .LCPI6_0: +; HARDBE-NEXT: .byte 0 @ 0x0 +; HARDBE-NEXT: .byte 7 @ 0x7 +; HARDBE-NEXT: .byte 0 @ 0x0 +; HARDBE-NEXT: .byte 7 @ 0x7 +; HARDBE-NEXT: .byte 3 @ 0x3 +; HARDBE-NEXT: .byte 1 @ 0x1 +; HARDBE-NEXT: .byte 0 @ 0x0 +; HARDBE-NEXT: .byte 5 @ 0x5 +; +; SOFTFPBE-LABEL: cast5: +; SOFTFPBE: @ %bb.0: @ %entry +; SOFTFPBE-NEXT: vmov d16, r1, r0 +; SOFTFPBE-NEXT: vrev64.16 d16, d16 +; SOFTFPBE-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBE-NEXT: vorr d17, d16, d16 +; SOFTFPBE-NEXT: vorr d18, d16, d16 +; SOFTFPBE-NEXT: vuzp.16 d17, d18 +; SOFTFPBE-NEXT: vldr d17, .LCPI6_0 +; SOFTFPBE-NEXT: vext.16 d16, d18, d16, #2 +; SOFTFPBE-NEXT: vrev64.8 d17, d17 +; SOFTFPBE-NEXT: vrev16.8 d16, d16 +; SOFTFPBE-NEXT: vadd.i8 d16, d16, d16 +; SOFTFPBE-NEXT: vtbl.8 d16, {d16}, d17 +; SOFTFPBE-NEXT: vrev64.8 d16, d16 +; SOFTFPBE-NEXT: vmov r1, r0, d16 +; SOFTFPBE-NEXT: bx lr +; SOFTFPBE-NEXT: .p2align 3 +; SOFTFPBE-NEXT: @ %bb.1: +; SOFTFPBE-NEXT: .LCPI6_0: +; SOFTFPBE-NEXT: .byte 0 @ 0x0 +; SOFTFPBE-NEXT: .byte 7 @ 0x7 +; SOFTFPBE-NEXT: .byte 0 @ 0x0 +; SOFTFPBE-NEXT: .byte 7 @ 0x7 +; SOFTFPBE-NEXT: .byte 3 @ 0x3 +; SOFTFPBE-NEXT: .byte 1 @ 0x1 +; SOFTFPBE-NEXT: .byte 0 @ 0x0 +; SOFTFPBE-NEXT: .byte 5 @ 0x5 +; +; HARDBEA-LABEL: cast5: +; HARDBEA: @ %bb.0: @ %entry +; HARDBEA-NEXT: vrev64.16 d16, d0 +; HARDBEA-NEXT: vadd.f16 d16, d16, d16 +; HARDBEA-NEXT: vorr d17, d16, d16 +; HARDBEA-NEXT: vorr d18, d16, d16 +; HARDBEA-NEXT: vuzp.16 d17, d18 +; HARDBEA-NEXT: vldr d17, .LCPI6_0 +; HARDBEA-NEXT: vext.16 d16, d18, d16, #2 +; HARDBEA-NEXT: vrev64.8 d17, d17 +; HARDBEA-NEXT: vrev16.8 d16, d16 +; HARDBEA-NEXT: vadd.i8 d16, d16, d16 +; HARDBEA-NEXT: vtbl.8 d16, {d16}, d17 +; HARDBEA-NEXT: vrev64.8 d0, d16 +; HARDBEA-NEXT: bx lr +; HARDBEA-NEXT: .p2align 3 +; HARDBEA-NEXT: @ %bb.1: +; HARDBEA-NEXT: .LCPI6_0: +; HARDBEA-NEXT: .byte 0 @ 0x0 +; HARDBEA-NEXT: .byte 7 @ 0x7 +; HARDBEA-NEXT: .byte 0 @ 0x0 +; HARDBEA-NEXT: .byte 7 @ 0x7 +; HARDBEA-NEXT: .byte 3 @ 0x3 +; HARDBEA-NEXT: .byte 1 @ 0x1 +; HARDBEA-NEXT: .byte 0 @ 0x0 +; HARDBEA-NEXT: .byte 5 @ 0x5 +; +; SOFTFPBEA-LABEL: cast5: +; SOFTFPBEA: @ %bb.0: @ %entry +; SOFTFPBEA-NEXT: vmov d16, r1, r0 +; SOFTFPBEA-NEXT: vrev64.16 d16, d16 +; SOFTFPBEA-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBEA-NEXT: 
vorr d17, d16, d16 +; SOFTFPBEA-NEXT: vorr d18, d16, d16 +; SOFTFPBEA-NEXT: vuzp.16 d17, d18 +; SOFTFPBEA-NEXT: vldr d17, .LCPI6_0 +; SOFTFPBEA-NEXT: vext.16 d16, d18, d16, #2 +; SOFTFPBEA-NEXT: vrev64.8 d17, d17 +; SOFTFPBEA-NEXT: vrev16.8 d16, d16 +; SOFTFPBEA-NEXT: vadd.i8 d16, d16, d16 +; SOFTFPBEA-NEXT: vtbl.8 d16, {d16}, d17 +; SOFTFPBEA-NEXT: vrev64.8 d16, d16 +; SOFTFPBEA-NEXT: vmov r1, r0, d16 +; SOFTFPBEA-NEXT: bx lr +; SOFTFPBEA-NEXT: .p2align 3 +; SOFTFPBEA-NEXT: @ %bb.1: +; SOFTFPBEA-NEXT: .LCPI6_0: +; SOFTFPBEA-NEXT: .byte 0 @ 0x0 +; SOFTFPBEA-NEXT: .byte 7 @ 0x7 +; SOFTFPBEA-NEXT: .byte 0 @ 0x0 +; SOFTFPBEA-NEXT: .byte 7 @ 0x7 +; SOFTFPBEA-NEXT: .byte 3 @ 0x3 +; SOFTFPBEA-NEXT: .byte 1 @ 0x1 +; SOFTFPBEA-NEXT: .byte 0 @ 0x0 +; SOFTFPBEA-NEXT: .byte 5 @ 0x5 +entry: + %b = fadd <4 x half> %a, %a + %c = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> + %d = bitcast <4 x half> %c to <8 x i8> + %e = add <8 x i8> %d, %d + %r = shufflevector <8 x i8> %e, <8 x i8> %e, <8 x i32> + ret <8 x i8> %r +} + +define <2 x i32> @cast6(<4 x half> %a) { +; HARD-LABEL: cast6: +; HARD: @ %bb.0: @ %entry +; HARD-NEXT: vmov d16, r0, r1 +; HARD-NEXT: vadd.f16 d16, d16, d16 +; HARD-NEXT: vorr d17, d16, d16 +; HARD-NEXT: vorr d18, d16, d16 +; HARD-NEXT: vuzp.16 d17, d18 +; HARD-NEXT: vext.16 d16, d18, d16, #2 +; HARD-NEXT: vadd.i32 d16, d16, d16 +; HARD-NEXT: vmov r0, r1, d16 +; HARD-NEXT: bx lr +; +; SOFT-LABEL: cast6: +; SOFT: @ %bb.0: @ %entry +; SOFT-NEXT: vmov d16, r0, r1 +; SOFT-NEXT: vadd.f16 d16, d16, d16 +; SOFT-NEXT: vorr d17, d16, d16 +; SOFT-NEXT: vorr d18, d16, d16 +; SOFT-NEXT: vuzp.16 d17, d18 +; SOFT-NEXT: vext.16 d16, d18, d16, #2 +; SOFT-NEXT: vadd.i32 d16, d16, d16 +; SOFT-NEXT: vmov r0, r1, d16 +; SOFT-NEXT: bx lr +; +; HARDA-LABEL: cast6: +; HARDA: @ %bb.0: @ %entry +; HARDA-NEXT: vadd.f16 d16, d0, d0 +; HARDA-NEXT: vorr d17, d16, d16 +; HARDA-NEXT: vorr d18, d16, d16 +; HARDA-NEXT: vuzp.16 d17, d18 +; HARDA-NEXT: vext.16 d16, d18, d16, #2 +; HARDA-NEXT: vadd.i32 d0, d16, d16 +; HARDA-NEXT: bx lr +; +; SOFTA-LABEL: cast6: +; SOFTA: @ %bb.0: @ %entry +; SOFTA-NEXT: vmov d16, r0, r1 +; SOFTA-NEXT: vadd.f16 d16, d16, d16 +; SOFTA-NEXT: vorr d17, d16, d16 +; SOFTA-NEXT: vorr d18, d16, d16 +; SOFTA-NEXT: vuzp.16 d17, d18 +; SOFTA-NEXT: vext.16 d16, d18, d16, #2 +; SOFTA-NEXT: vadd.i32 d16, d16, d16 +; SOFTA-NEXT: vmov r0, r1, d16 +; SOFTA-NEXT: bx lr +; +; HARDBE-LABEL: cast6: +; HARDBE: @ %bb.0: @ %entry +; HARDBE-NEXT: vmov d16, r1, r0 +; HARDBE-NEXT: vrev64.16 d16, d16 +; HARDBE-NEXT: vadd.f16 d16, d16, d16 +; HARDBE-NEXT: vorr d17, d16, d16 +; HARDBE-NEXT: vorr d18, d16, d16 +; HARDBE-NEXT: vuzp.16 d17, d18 +; HARDBE-NEXT: vext.16 d16, d18, d16, #2 +; HARDBE-NEXT: vrev32.16 d16, d16 +; HARDBE-NEXT: vadd.i32 d16, d16, d16 +; HARDBE-NEXT: vrev64.32 d16, d16 +; HARDBE-NEXT: vmov r1, r0, d16 +; HARDBE-NEXT: bx lr +; +; SOFTFPBE-LABEL: cast6: +; SOFTFPBE: @ %bb.0: @ %entry +; SOFTFPBE-NEXT: vmov d16, r1, r0 +; SOFTFPBE-NEXT: vrev64.16 d16, d16 +; SOFTFPBE-NEXT: vadd.f16 d16, d16, d16 +; SOFTFPBE-NEXT: vorr d17, d16, d16 +; SOFTFPBE-NEXT: vorr d18, d16, d16 +; SOFTFPBE-NEXT: vuzp.16 d17, d18 +; SOFTFPBE-NEXT: vext.16 d16, d18, d16, #2 +; SOFTFPBE-NEXT: vrev32.16 d16, d16 +; SOFTFPBE-NEXT: vadd.i32 d16, d16, d16 +; SOFTFPBE-NEXT: vrev64.32 d16, d16 +; SOFTFPBE-NEXT: vmov r1, r0, d16 +; SOFTFPBE-NEXT: bx lr +; +; HARDBEA-LABEL: cast6: +; HARDBEA: @ %bb.0: @ %entry +; HARDBEA-NEXT: vrev64.16 d16, d0 +; HARDBEA-NEXT: vadd.f16 d16, d16, d16 +; HARDBEA-NEXT: vorr d17, d16, d16 +; 
HARDBEA-NEXT: vorr d18, d16, d16
+; HARDBEA-NEXT: vuzp.16 d17, d18
+; HARDBEA-NEXT: vext.16 d16, d18, d16, #2
+; HARDBEA-NEXT: vrev32.16 d16, d16
+; HARDBEA-NEXT: vadd.i32 d16, d16, d16
+; HARDBEA-NEXT: vrev64.32 d0, d16
+; HARDBEA-NEXT: bx lr
+;
+; SOFTFPBEA-LABEL: cast6:
+; SOFTFPBEA: @ %bb.0: @ %entry
+; SOFTFPBEA-NEXT: vmov d16, r1, r0
+; SOFTFPBEA-NEXT: vrev64.16 d16, d16
+; SOFTFPBEA-NEXT: vadd.f16 d16, d16, d16
+; SOFTFPBEA-NEXT: vorr d17, d16, d16
+; SOFTFPBEA-NEXT: vorr d18, d16, d16
+; SOFTFPBEA-NEXT: vuzp.16 d17, d18
+; SOFTFPBEA-NEXT: vext.16 d16, d18, d16, #2
+; SOFTFPBEA-NEXT: vrev32.16 d16, d16
+; SOFTFPBEA-NEXT: vadd.i32 d16, d16, d16
+; SOFTFPBEA-NEXT: vrev64.32 d16, d16
+; SOFTFPBEA-NEXT: vmov r1, r0, d16
+; SOFTFPBEA-NEXT: bx lr
+entry:
+ %b = fadd <4 x half> %a, %a
+ %c = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32>
+ %d = bitcast <4 x half> %c to <2 x i32>
+ %e = add <2 x i32> %d, %d
+ %r = shufflevector <2 x i32> %e, <2 x i32> %e, <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x float> @cast7(<4 x half> %a) {
+; HARD-LABEL: cast7:
+; HARD: @ %bb.0: @ %entry
+; HARD-NEXT: vmov d16, r0, r1
+; HARD-NEXT: vadd.f16 d16, d16, d16
+; HARD-NEXT: vorr d17, d16, d16
+; HARD-NEXT: vorr d18, d16, d16
+; HARD-NEXT: vuzp.16 d17, d18
+; HARD-NEXT: vext.16 d16, d18, d16, #2
+; HARD-NEXT: vadd.f32 d16, d16, d16
+; HARD-NEXT: vmov r0, r1, d16
+; HARD-NEXT: bx lr
+;
+; SOFT-LABEL: cast7:
+; SOFT: @ %bb.0: @ %entry
+; SOFT-NEXT: vmov d16, r0, r1
+; SOFT-NEXT: vadd.f16 d16, d16, d16
+; SOFT-NEXT: vorr d17, d16, d16
+; SOFT-NEXT: vorr d18, d16, d16
+; SOFT-NEXT: vuzp.16 d17, d18
+; SOFT-NEXT: vext.16 d16, d18, d16, #2
+; SOFT-NEXT: vadd.f32 d16, d16, d16
+; SOFT-NEXT: vmov r0, r1, d16
+; SOFT-NEXT: bx lr
+;
+; HARDA-LABEL: cast7:
+; HARDA: @ %bb.0: @ %entry
+; HARDA-NEXT: vadd.f16 d16, d0, d0
+; HARDA-NEXT: vorr d17, d16, d16
+; HARDA-NEXT: vorr d18, d16, d16
+; HARDA-NEXT: vuzp.16 d17, d18
+; HARDA-NEXT: vext.16 d16, d18, d16, #2
+; HARDA-NEXT: vadd.f32 d0, d16, d16
+; HARDA-NEXT: bx lr
+;
+; SOFTA-LABEL: cast7:
+; SOFTA: @ %bb.0: @ %entry
+; SOFTA-NEXT: vmov d16, r0, r1
+; SOFTA-NEXT: vadd.f16 d16, d16, d16
+; SOFTA-NEXT: vorr d17, d16, d16
+; SOFTA-NEXT: vorr d18, d16, d16
+; SOFTA-NEXT: vuzp.16 d17, d18
+; SOFTA-NEXT: vext.16 d16, d18, d16, #2
+; SOFTA-NEXT: vadd.f32 d16, d16, d16
+; SOFTA-NEXT: vmov r0, r1, d16
+; SOFTA-NEXT: bx lr
+;
+; HARDBE-LABEL: cast7:
+; HARDBE: @ %bb.0: @ %entry
+; HARDBE-NEXT: vmov d16, r1, r0
+; HARDBE-NEXT: vrev64.16 d16, d16
+; HARDBE-NEXT: vadd.f16 d16, d16, d16
+; HARDBE-NEXT: vorr d17, d16, d16
+; HARDBE-NEXT: vorr d18, d16, d16
+; HARDBE-NEXT: vuzp.16 d17, d18
+; HARDBE-NEXT: vext.16 d16, d18, d16, #2
+; HARDBE-NEXT: vrev32.16 d16, d16
+; HARDBE-NEXT: vadd.f32 d16, d16, d16
+; HARDBE-NEXT: vrev64.32 d16, d16
+; HARDBE-NEXT: vmov r1, r0, d16
+; HARDBE-NEXT: bx lr
+;
+; SOFTFPBE-LABEL: cast7:
+; SOFTFPBE: @ %bb.0: @ %entry
+; SOFTFPBE-NEXT: vmov d16, r1, r0
+; SOFTFPBE-NEXT: vrev64.16 d16, d16
+; SOFTFPBE-NEXT: vadd.f16 d16, d16, d16
+; SOFTFPBE-NEXT: vorr d17, d16, d16
+; SOFTFPBE-NEXT: vorr d18, d16, d16
+; SOFTFPBE-NEXT: vuzp.16 d17, d18
+; SOFTFPBE-NEXT: vext.16 d16, d18, d16, #2
+; SOFTFPBE-NEXT: vrev32.16 d16, d16
+; SOFTFPBE-NEXT: vadd.f32 d16, d16, d16
+; SOFTFPBE-NEXT: vrev64.32 d16, d16
+; SOFTFPBE-NEXT: vmov r1, r0, d16
+; SOFTFPBE-NEXT: bx lr
+;
+; HARDBEA-LABEL: cast7:
+; HARDBEA: @ %bb.0: @ %entry
+; HARDBEA-NEXT: vrev64.16 d16, d0
+; HARDBEA-NEXT: vadd.f16 d16, d16, d16
+; HARDBEA-NEXT: vorr d17, d16, d16
+; HARDBEA-NEXT: vorr d18, d16, d16
+; HARDBEA-NEXT: vuzp.16 d17, d18
+; HARDBEA-NEXT: vext.16 d16, d18, d16, #2
+; HARDBEA-NEXT: vrev32.16 d16, d16
+; HARDBEA-NEXT: vadd.f32 d16, d16, d16
+; HARDBEA-NEXT: vrev64.32 d0, d16
+; HARDBEA-NEXT: bx lr
+;
+; SOFTFPBEA-LABEL: cast7:
+; SOFTFPBEA: @ %bb.0: @ %entry
+; SOFTFPBEA-NEXT: vmov d16, r1, r0
+; SOFTFPBEA-NEXT: vrev64.16 d16, d16
+; SOFTFPBEA-NEXT: vadd.f16 d16, d16, d16
+; SOFTFPBEA-NEXT: vorr d17, d16, d16
+; SOFTFPBEA-NEXT: vorr d18, d16, d16
+; SOFTFPBEA-NEXT: vuzp.16 d17, d18
+; SOFTFPBEA-NEXT: vext.16 d16, d18, d16, #2
+; SOFTFPBEA-NEXT: vrev32.16 d16, d16
+; SOFTFPBEA-NEXT: vadd.f32 d16, d16, d16
+; SOFTFPBEA-NEXT: vrev64.32 d16, d16
+; SOFTFPBEA-NEXT: vmov r1, r0, d16
+; SOFTFPBEA-NEXT: bx lr
+entry:
+ %b = fadd <4 x half> %a, %a
+ %c = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32>
+ %d = bitcast <4 x half> %c to <2 x float>
+ %e = fadd <2 x float> %d, %d
+ %r = shufflevector <2 x float> %e, <2 x float> %e, <2 x i32>
+ ret <2 x float> %r
+}
+
+define <8 x half> @cast8(<2 x i64> %a) {
+; HARD-LABEL: cast8:
+; HARD: @ %bb.0: @ %entry
+; HARD-NEXT: vmov d17, r2, r3
+; HARD-NEXT: vmov d16, r0, r1
+; HARD-NEXT: vadd.i8 q8, q8, q8
+; HARD-NEXT: vadd.f16 q8, q8, q8
+; HARD-NEXT: vmov r0, r1, d16
+; HARD-NEXT: vmov r2, r3, d17
+; HARD-NEXT: bx lr
+;
+; SOFT-LABEL: cast8:
+; SOFT: @ %bb.0: @ %entry
+; SOFT-NEXT: vmov d17, r2, r3
+; SOFT-NEXT: vmov d16, r0, r1
+; SOFT-NEXT: vadd.i8 q8, q8, q8
+; SOFT-NEXT: vadd.f16 q8, q8, q8
+; SOFT-NEXT: vmov r0, r1, d16
+; SOFT-NEXT: vmov r2, r3, d17
+; SOFT-NEXT: bx lr
+;
+; HARDA-LABEL: cast8:
+; HARDA: @ %bb.0: @ %entry
+; HARDA-NEXT: vadd.i8 q8, q0, q0
+; HARDA-NEXT: vadd.f16 q0, q8, q8
+; HARDA-NEXT: bx lr
+;
+; SOFTA-LABEL: cast8:
+; SOFTA: @ %bb.0: @ %entry
+; SOFTA-NEXT: vmov d17, r2, r3
+; SOFTA-NEXT: vmov d16, r0, r1
+; SOFTA-NEXT: vadd.i8 q8, q8, q8
+; SOFTA-NEXT: vadd.f16 q8, q8, q8
+; SOFTA-NEXT: vmov r0, r1, d16
+; SOFTA-NEXT: vmov r2, r3, d17
+; SOFTA-NEXT: bx lr
+;
+; HARDBE-LABEL: cast8:
+; HARDBE: @ %bb.0: @ %entry
+; HARDBE-NEXT: vmov d17, r3, r2
+; HARDBE-NEXT: vmov d16, r1, r0
+; HARDBE-NEXT: vrev64.8 q8, q8
+; HARDBE-NEXT: vadd.i8 q8, q8, q8
+; HARDBE-NEXT: vrev16.8 q8, q8
+; HARDBE-NEXT: vadd.f16 q8, q8, q8
+; HARDBE-NEXT: vrev64.16 q8, q8
+; HARDBE-NEXT: vmov r1, r0, d16
+; HARDBE-NEXT: vmov r3, r2, d17
+; HARDBE-NEXT: bx lr
+;
+; SOFTFPBE-LABEL: cast8:
+; SOFTFPBE: @ %bb.0: @ %entry
+; SOFTFPBE-NEXT: vmov d17, r3, r2
+; SOFTFPBE-NEXT: vmov d16, r1, r0
+; SOFTFPBE-NEXT: vrev64.8 q8, q8
+; SOFTFPBE-NEXT: vadd.i8 q8, q8, q8
+; SOFTFPBE-NEXT: vrev16.8 q8, q8
+; SOFTFPBE-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBE-NEXT: vrev64.16 q8, q8
+; SOFTFPBE-NEXT: vmov r1, r0, d16
+; SOFTFPBE-NEXT: vmov r3, r2, d17
+; SOFTFPBE-NEXT: bx lr
+;
+; HARDBEA-LABEL: cast8:
+; HARDBEA: @ %bb.0: @ %entry
+; HARDBEA-NEXT: vrev64.8 q8, q0
+; HARDBEA-NEXT: vadd.i8 q8, q8, q8
+; HARDBEA-NEXT: vrev16.8 q8, q8
+; HARDBEA-NEXT: vadd.f16 q8, q8, q8
+; HARDBEA-NEXT: vrev64.16 q0, q8
+; HARDBEA-NEXT: bx lr
+;
+; SOFTFPBEA-LABEL: cast8:
+; SOFTFPBEA: @ %bb.0: @ %entry
+; SOFTFPBEA-NEXT: vmov d17, r3, r2
+; SOFTFPBEA-NEXT: vmov d16, r1, r0
+; SOFTFPBEA-NEXT: vrev64.8 q8, q8
+; SOFTFPBEA-NEXT: vadd.i8 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev16.8 q8, q8
+; SOFTFPBEA-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev64.16 q8, q8
+; SOFTFPBEA-NEXT: vmov r1, r0, d16
+; SOFTFPBEA-NEXT: vmov r3, r2, d17
+; SOFTFPBEA-NEXT: bx lr
+entry:
+ %v1 = bitcast <2 x i64> %a to <16 x i8>
+ %add1 = add <16 x i8> %v1, %v1
+ %v4f16 = bitcast <16 x i8> %add1 to <8 x half>
+ %add2 = fadd <8 x half> %v4f16, %v4f16
+ ret <8 x half> %add2
+}
+
+define <8 x half> @cast9(<2 x i64> %a) {
+; HARD-LABEL: cast9:
+; HARD: @ %bb.0: @ %entry
+; HARD-NEXT: vmov d17, r2, r3
+; HARD-NEXT: vmov d16, r0, r1
+; HARD-NEXT: vadd.i16 q8, q8, q8
+; HARD-NEXT: vadd.f16 q8, q8, q8
+; HARD-NEXT: vmov r0, r1, d16
+; HARD-NEXT: vmov r2, r3, d17
+; HARD-NEXT: bx lr
+;
+; SOFT-LABEL: cast9:
+; SOFT: @ %bb.0: @ %entry
+; SOFT-NEXT: vmov d17, r2, r3
+; SOFT-NEXT: vmov d16, r0, r1
+; SOFT-NEXT: vadd.i16 q8, q8, q8
+; SOFT-NEXT: vadd.f16 q8, q8, q8
+; SOFT-NEXT: vmov r0, r1, d16
+; SOFT-NEXT: vmov r2, r3, d17
+; SOFT-NEXT: bx lr
+;
+; HARDA-LABEL: cast9:
+; HARDA: @ %bb.0: @ %entry
+; HARDA-NEXT: vadd.i16 q8, q0, q0
+; HARDA-NEXT: vadd.f16 q0, q8, q8
+; HARDA-NEXT: bx lr
+;
+; SOFTA-LABEL: cast9:
+; SOFTA: @ %bb.0: @ %entry
+; SOFTA-NEXT: vmov d17, r2, r3
+; SOFTA-NEXT: vmov d16, r0, r1
+; SOFTA-NEXT: vadd.i16 q8, q8, q8
+; SOFTA-NEXT: vadd.f16 q8, q8, q8
+; SOFTA-NEXT: vmov r0, r1, d16
+; SOFTA-NEXT: vmov r2, r3, d17
+; SOFTA-NEXT: bx lr
+;
+; HARDBE-LABEL: cast9:
+; HARDBE: @ %bb.0: @ %entry
+; HARDBE-NEXT: vmov d17, r3, r2
+; HARDBE-NEXT: vmov d16, r1, r0
+; HARDBE-NEXT: vrev64.16 q8, q8
+; HARDBE-NEXT: vadd.i16 q8, q8, q8
+; HARDBE-NEXT: vadd.f16 q8, q8, q8
+; HARDBE-NEXT: vrev64.16 q8, q8
+; HARDBE-NEXT: vmov r1, r0, d16
+; HARDBE-NEXT: vmov r3, r2, d17
+; HARDBE-NEXT: bx lr
+;
+; SOFTFPBE-LABEL: cast9:
+; SOFTFPBE: @ %bb.0: @ %entry
+; SOFTFPBE-NEXT: vmov d17, r3, r2
+; SOFTFPBE-NEXT: vmov d16, r1, r0
+; SOFTFPBE-NEXT: vrev64.16 q8, q8
+; SOFTFPBE-NEXT: vadd.i16 q8, q8, q8
+; SOFTFPBE-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBE-NEXT: vrev64.16 q8, q8
+; SOFTFPBE-NEXT: vmov r1, r0, d16
+; SOFTFPBE-NEXT: vmov r3, r2, d17
+; SOFTFPBE-NEXT: bx lr
+;
+; HARDBEA-LABEL: cast9:
+; HARDBEA: @ %bb.0: @ %entry
+; HARDBEA-NEXT: vrev64.16 q8, q0
+; HARDBEA-NEXT: vadd.i16 q8, q8, q8
+; HARDBEA-NEXT: vadd.f16 q8, q8, q8
+; HARDBEA-NEXT: vrev64.16 q0, q8
+; HARDBEA-NEXT: bx lr
+;
+; SOFTFPBEA-LABEL: cast9:
+; SOFTFPBEA: @ %bb.0: @ %entry
+; SOFTFPBEA-NEXT: vmov d17, r3, r2
+; SOFTFPBEA-NEXT: vmov d16, r1, r0
+; SOFTFPBEA-NEXT: vrev64.16 q8, q8
+; SOFTFPBEA-NEXT: vadd.i16 q8, q8, q8
+; SOFTFPBEA-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev64.16 q8, q8
+; SOFTFPBEA-NEXT: vmov r1, r0, d16
+; SOFTFPBEA-NEXT: vmov r3, r2, d17
+; SOFTFPBEA-NEXT: bx lr
+entry:
+ %v1 = bitcast <2 x i64> %a to <8 x i16>
+ %add1 = add <8 x i16> %v1, %v1
+ %v4f16 = bitcast <8 x i16> %add1 to <8 x half>
+ %add2 = fadd <8 x half> %v4f16, %v4f16
+ ret <8 x half> %add2
+}
+
+define <8 x half> @cast10(<2 x i64> %a) {
+; HARD-LABEL: cast10:
+; HARD: @ %bb.0: @ %entry
+; HARD-NEXT: vmov d17, r2, r3
+; HARD-NEXT: vmov d16, r0, r1
+; HARD-NEXT: vadd.f32 q8, q8, q8
+; HARD-NEXT: vadd.f16 q8, q8, q8
+; HARD-NEXT: vmov r0, r1, d16
+; HARD-NEXT: vmov r2, r3, d17
+; HARD-NEXT: bx lr
+;
+; SOFT-LABEL: cast10:
+; SOFT: @ %bb.0: @ %entry
+; SOFT-NEXT: vmov d17, r2, r3
+; SOFT-NEXT: vmov d16, r0, r1
+; SOFT-NEXT: vadd.f32 q8, q8, q8
+; SOFT-NEXT: vadd.f16 q8, q8, q8
+; SOFT-NEXT: vmov r0, r1, d16
+; SOFT-NEXT: vmov r2, r3, d17
+; SOFT-NEXT: bx lr
+;
+; HARDA-LABEL: cast10:
+; HARDA: @ %bb.0: @ %entry
+; HARDA-NEXT: vadd.f32 q8, q0, q0
+; HARDA-NEXT: vadd.f16 q0, q8, q8
+; HARDA-NEXT: bx lr
+;
+; SOFTA-LABEL: cast10:
+; SOFTA: @ %bb.0: @ %entry
+; SOFTA-NEXT: vmov d17, r2, r3
+; SOFTA-NEXT: vmov d16, r0, r1
+; SOFTA-NEXT: vadd.f32 q8, q8, q8
+; SOFTA-NEXT: vadd.f16 q8, q8, q8
+; SOFTA-NEXT: vmov r0, r1, d16
+; SOFTA-NEXT: vmov r2, r3, d17
+; SOFTA-NEXT: bx lr
+;
+; HARDBE-LABEL: cast10:
+; HARDBE: @ %bb.0: @ %entry
+; HARDBE-NEXT: vmov d17, r3, r2
+; HARDBE-NEXT: vmov d16, r1, r0
+; HARDBE-NEXT: vrev64.32 q8, q8
+; HARDBE-NEXT: vadd.f32 q8, q8, q8
+; HARDBE-NEXT: vrev32.16 q8, q8
+; HARDBE-NEXT: vadd.f16 q8, q8, q8
+; HARDBE-NEXT: vrev64.16 q8, q8
+; HARDBE-NEXT: vmov r1, r0, d16
+; HARDBE-NEXT: vmov r3, r2, d17
+; HARDBE-NEXT: bx lr
+;
+; SOFTFPBE-LABEL: cast10:
+; SOFTFPBE: @ %bb.0: @ %entry
+; SOFTFPBE-NEXT: vmov d17, r3, r2
+; SOFTFPBE-NEXT: vmov d16, r1, r0
+; SOFTFPBE-NEXT: vrev64.32 q8, q8
+; SOFTFPBE-NEXT: vadd.f32 q8, q8, q8
+; SOFTFPBE-NEXT: vrev32.16 q8, q8
+; SOFTFPBE-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBE-NEXT: vrev64.16 q8, q8
+; SOFTFPBE-NEXT: vmov r1, r0, d16
+; SOFTFPBE-NEXT: vmov r3, r2, d17
+; SOFTFPBE-NEXT: bx lr
+;
+; HARDBEA-LABEL: cast10:
+; HARDBEA: @ %bb.0: @ %entry
+; HARDBEA-NEXT: vrev64.32 q8, q0
+; HARDBEA-NEXT: vadd.f32 q8, q8, q8
+; HARDBEA-NEXT: vrev32.16 q8, q8
+; HARDBEA-NEXT: vadd.f16 q8, q8, q8
+; HARDBEA-NEXT: vrev64.16 q0, q8
+; HARDBEA-NEXT: bx lr
+;
+; SOFTFPBEA-LABEL: cast10:
+; SOFTFPBEA: @ %bb.0: @ %entry
+; SOFTFPBEA-NEXT: vmov d17, r3, r2
+; SOFTFPBEA-NEXT: vmov d16, r1, r0
+; SOFTFPBEA-NEXT: vrev64.32 q8, q8
+; SOFTFPBEA-NEXT: vadd.f32 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev32.16 q8, q8
+; SOFTFPBEA-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev64.16 q8, q8
+; SOFTFPBEA-NEXT: vmov r1, r0, d16
+; SOFTFPBEA-NEXT: vmov r3, r2, d17
+; SOFTFPBEA-NEXT: bx lr
+entry:
+ %v1 = bitcast <2 x i64> %a to <4 x float>
+ %add1 = fadd <4 x float> %v1, %v1
+ %v4f16 = bitcast <4 x float> %add1 to <8 x half>
+ %add2 = fadd <8 x half> %v4f16, %v4f16
+ ret <8 x half> %add2
+}
+
+define <8 x half> @cast11(<2 x i64> %a) {
+; HARD-LABEL: cast11:
+; HARD: @ %bb.0: @ %entry
+; HARD-NEXT: vmov d17, r2, r3
+; HARD-NEXT: vmov d16, r0, r1
+; HARD-NEXT: vadd.i32 q8, q8, q8
+; HARD-NEXT: vadd.f16 q8, q8, q8
+; HARD-NEXT: vmov r0, r1, d16
+; HARD-NEXT: vmov r2, r3, d17
+; HARD-NEXT: bx lr
+;
+; SOFT-LABEL: cast11:
+; SOFT: @ %bb.0: @ %entry
+; SOFT-NEXT: vmov d17, r2, r3
+; SOFT-NEXT: vmov d16, r0, r1
+; SOFT-NEXT: vadd.i32 q8, q8, q8
+; SOFT-NEXT: vadd.f16 q8, q8, q8
+; SOFT-NEXT: vmov r0, r1, d16
+; SOFT-NEXT: vmov r2, r3, d17
+; SOFT-NEXT: bx lr
+;
+; HARDA-LABEL: cast11:
+; HARDA: @ %bb.0: @ %entry
+; HARDA-NEXT: vadd.i32 q8, q0, q0
+; HARDA-NEXT: vadd.f16 q0, q8, q8
+; HARDA-NEXT: bx lr
+;
+; SOFTA-LABEL: cast11:
+; SOFTA: @ %bb.0: @ %entry
+; SOFTA-NEXT: vmov d17, r2, r3
+; SOFTA-NEXT: vmov d16, r0, r1
+; SOFTA-NEXT: vadd.i32 q8, q8, q8
+; SOFTA-NEXT: vadd.f16 q8, q8, q8
+; SOFTA-NEXT: vmov r0, r1, d16
+; SOFTA-NEXT: vmov r2, r3, d17
+; SOFTA-NEXT: bx lr
+;
+; HARDBE-LABEL: cast11:
+; HARDBE: @ %bb.0: @ %entry
+; HARDBE-NEXT: vmov d17, r3, r2
+; HARDBE-NEXT: vmov d16, r1, r0
+; HARDBE-NEXT: vrev64.32 q8, q8
+; HARDBE-NEXT: vadd.i32 q8, q8, q8
+; HARDBE-NEXT: vrev32.16 q8, q8
+; HARDBE-NEXT: vadd.f16 q8, q8, q8
+; HARDBE-NEXT: vrev64.16 q8, q8
+; HARDBE-NEXT: vmov r1, r0, d16
+; HARDBE-NEXT: vmov r3, r2, d17
+; HARDBE-NEXT: bx lr
+;
+; SOFTFPBE-LABEL: cast11:
+; SOFTFPBE: @ %bb.0: @ %entry
+; SOFTFPBE-NEXT: vmov d17, r3, r2
+; SOFTFPBE-NEXT: vmov d16, r1, r0
+; SOFTFPBE-NEXT: vrev64.32 q8, q8
+; SOFTFPBE-NEXT: vadd.i32 q8, q8, q8
+; SOFTFPBE-NEXT: vrev32.16 q8, q8
+; SOFTFPBE-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBE-NEXT: vrev64.16 q8, q8
+; SOFTFPBE-NEXT: vmov r1, r0, d16
+; SOFTFPBE-NEXT: vmov r3, r2, d17
+; SOFTFPBE-NEXT: bx lr
+;
+; HARDBEA-LABEL: cast11:
+; HARDBEA: @ %bb.0: @ %entry
+; HARDBEA-NEXT: vrev64.32 q8, q0
+; HARDBEA-NEXT: vadd.i32 q8, q8, q8
+; HARDBEA-NEXT: vrev32.16 q8, q8
+; HARDBEA-NEXT: vadd.f16 q8, q8, q8
+; HARDBEA-NEXT: vrev64.16 q0, q8
+; HARDBEA-NEXT: bx lr
+;
+; SOFTFPBEA-LABEL: cast11:
+; SOFTFPBEA: @ %bb.0: @ %entry
+; SOFTFPBEA-NEXT: vmov d17, r3, r2
+; SOFTFPBEA-NEXT: vmov d16, r1, r0
+; SOFTFPBEA-NEXT: vrev64.32 q8, q8
+; SOFTFPBEA-NEXT: vadd.i32 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev32.16 q8, q8
+; SOFTFPBEA-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev64.16 q8, q8
+; SOFTFPBEA-NEXT: vmov r1, r0, d16
+; SOFTFPBEA-NEXT: vmov r3, r2, d17
+; SOFTFPBEA-NEXT: bx lr
+entry:
+ %v1 = bitcast <2 x i64> %a to <4 x i32>
+ %add1 = add <4 x i32> %v1, %v1
+ %v4f16 = bitcast <4 x i32> %add1 to <8 x half>
+ %add2 = fadd <8 x half> %v4f16, %v4f16
+ ret <8 x half> %add2
+}
+
+define <16 x i8> @cast12(<8 x half> %a) {
+; HARD-LABEL: cast12:
+; HARD: @ %bb.0: @ %entry
+; HARD-NEXT: vmov d17, r2, r3
+; HARD-NEXT: vmov d16, r0, r1
+; HARD-NEXT: vadd.f16 q8, q8, q8
+; HARD-NEXT: vadd.i8 q8, q8, q8
+; HARD-NEXT: vmov r0, r1, d16
+; HARD-NEXT: vmov r2, r3, d17
+; HARD-NEXT: bx lr
+;
+; SOFT-LABEL: cast12:
+; SOFT: @ %bb.0: @ %entry
+; SOFT-NEXT: vmov d17, r2, r3
+; SOFT-NEXT: vmov d16, r0, r1
+; SOFT-NEXT: vadd.f16 q8, q8, q8
+; SOFT-NEXT: vadd.i8 q8, q8, q8
+; SOFT-NEXT: vmov r0, r1, d16
+; SOFT-NEXT: vmov r2, r3, d17
+; SOFT-NEXT: bx lr
+;
+; HARDA-LABEL: cast12:
+; HARDA: @ %bb.0: @ %entry
+; HARDA-NEXT: vadd.f16 q8, q0, q0
+; HARDA-NEXT: vadd.i8 q0, q8, q8
+; HARDA-NEXT: bx lr
+;
+; SOFTA-LABEL: cast12:
+; SOFTA: @ %bb.0: @ %entry
+; SOFTA-NEXT: vmov d17, r2, r3
+; SOFTA-NEXT: vmov d16, r0, r1
+; SOFTA-NEXT: vadd.f16 q8, q8, q8
+; SOFTA-NEXT: vadd.i8 q8, q8, q8
+; SOFTA-NEXT: vmov r0, r1, d16
+; SOFTA-NEXT: vmov r2, r3, d17
+; SOFTA-NEXT: bx lr
+;
+; HARDBE-LABEL: cast12:
+; HARDBE: @ %bb.0: @ %entry
+; HARDBE-NEXT: vmov d17, r3, r2
+; HARDBE-NEXT: vmov d16, r1, r0
+; HARDBE-NEXT: vrev64.16 q8, q8
+; HARDBE-NEXT: vadd.f16 q8, q8, q8
+; HARDBE-NEXT: vrev16.8 q8, q8
+; HARDBE-NEXT: vadd.i8 q8, q8, q8
+; HARDBE-NEXT: vrev64.8 q8, q8
+; HARDBE-NEXT: vmov r1, r0, d16
+; HARDBE-NEXT: vmov r3, r2, d17
+; HARDBE-NEXT: bx lr
+;
+; SOFTFPBE-LABEL: cast12:
+; SOFTFPBE: @ %bb.0: @ %entry
+; SOFTFPBE-NEXT: vmov d17, r3, r2
+; SOFTFPBE-NEXT: vmov d16, r1, r0
+; SOFTFPBE-NEXT: vrev64.16 q8, q8
+; SOFTFPBE-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBE-NEXT: vrev16.8 q8, q8
+; SOFTFPBE-NEXT: vadd.i8 q8, q8, q8
+; SOFTFPBE-NEXT: vrev64.8 q8, q8
+; SOFTFPBE-NEXT: vmov r1, r0, d16
+; SOFTFPBE-NEXT: vmov r3, r2, d17
+; SOFTFPBE-NEXT: bx lr
+;
+; HARDBEA-LABEL: cast12:
+; HARDBEA: @ %bb.0: @ %entry
+; HARDBEA-NEXT: vrev64.16 q8, q0
+; HARDBEA-NEXT: vadd.f16 q8, q8, q8
+; HARDBEA-NEXT: vrev16.8 q8, q8
+; HARDBEA-NEXT: vadd.i8 q8, q8, q8
+; HARDBEA-NEXT: vrev64.8 q0, q8
+; HARDBEA-NEXT: bx lr
+;
+; SOFTFPBEA-LABEL: cast12:
+; SOFTFPBEA: @ %bb.0: @ %entry
+; SOFTFPBEA-NEXT: vmov d17, r3, r2
+; SOFTFPBEA-NEXT: vmov d16, r1, r0
+; SOFTFPBEA-NEXT: vrev64.16 q8, q8
+; SOFTFPBEA-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev16.8 q8, q8
+; SOFTFPBEA-NEXT: vadd.i8 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev64.8 q8, q8
+; SOFTFPBEA-NEXT: vmov r1, r0, d16
+; SOFTFPBEA-NEXT: vmov r3, r2, d17
+; SOFTFPBEA-NEXT: bx lr
+entry:
+ %b = fadd <8 x half> %a, %a
+ %d = bitcast <8 x half> %b to <16 x i8>
+ %e = add <16 x i8> %d, %d
+ ret <16 x i8> %e
+}
+
+define <4 x i32> @cast13(<8 x half> %a) {
+; HARD-LABEL: cast13:
+; HARD: @ %bb.0: @ %entry
+; HARD-NEXT: vmov d17, r2, r3
+; HARD-NEXT: vmov d16, r0, r1
+; HARD-NEXT: vadd.f16 q8, q8, q8
+; HARD-NEXT: vadd.i32 q8, q8, q8
+; HARD-NEXT: vmov r0, r1, d16
+; HARD-NEXT: vmov r2, r3, d17
+; HARD-NEXT: bx lr
+;
+; SOFT-LABEL: cast13:
+; SOFT: @ %bb.0: @ %entry
+; SOFT-NEXT: vmov d17, r2, r3
+; SOFT-NEXT: vmov d16, r0, r1
+; SOFT-NEXT: vadd.f16 q8, q8, q8
+; SOFT-NEXT: vadd.i32 q8, q8, q8
+; SOFT-NEXT: vmov r0, r1, d16
+; SOFT-NEXT: vmov r2, r3, d17
+; SOFT-NEXT: bx lr
+;
+; HARDA-LABEL: cast13:
+; HARDA: @ %bb.0: @ %entry
+; HARDA-NEXT: vadd.f16 q8, q0, q0
+; HARDA-NEXT: vadd.i32 q0, q8, q8
+; HARDA-NEXT: bx lr
+;
+; SOFTA-LABEL: cast13:
+; SOFTA: @ %bb.0: @ %entry
+; SOFTA-NEXT: vmov d17, r2, r3
+; SOFTA-NEXT: vmov d16, r0, r1
+; SOFTA-NEXT: vadd.f16 q8, q8, q8
+; SOFTA-NEXT: vadd.i32 q8, q8, q8
+; SOFTA-NEXT: vmov r0, r1, d16
+; SOFTA-NEXT: vmov r2, r3, d17
+; SOFTA-NEXT: bx lr
+;
+; HARDBE-LABEL: cast13:
+; HARDBE: @ %bb.0: @ %entry
+; HARDBE-NEXT: vmov d17, r3, r2
+; HARDBE-NEXT: vmov d16, r1, r0
+; HARDBE-NEXT: vrev64.16 q8, q8
+; HARDBE-NEXT: vadd.f16 q8, q8, q8
+; HARDBE-NEXT: vrev32.16 q8, q8
+; HARDBE-NEXT: vadd.i32 q8, q8, q8
+; HARDBE-NEXT: vrev64.32 q8, q8
+; HARDBE-NEXT: vmov r1, r0, d16
+; HARDBE-NEXT: vmov r3, r2, d17
+; HARDBE-NEXT: bx lr
+;
+; SOFTFPBE-LABEL: cast13:
+; SOFTFPBE: @ %bb.0: @ %entry
+; SOFTFPBE-NEXT: vmov d17, r3, r2
+; SOFTFPBE-NEXT: vmov d16, r1, r0
+; SOFTFPBE-NEXT: vrev64.16 q8, q8
+; SOFTFPBE-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBE-NEXT: vrev32.16 q8, q8
+; SOFTFPBE-NEXT: vadd.i32 q8, q8, q8
+; SOFTFPBE-NEXT: vrev64.32 q8, q8
+; SOFTFPBE-NEXT: vmov r1, r0, d16
+; SOFTFPBE-NEXT: vmov r3, r2, d17
+; SOFTFPBE-NEXT: bx lr
+;
+; HARDBEA-LABEL: cast13:
+; HARDBEA: @ %bb.0: @ %entry
+; HARDBEA-NEXT: vrev64.16 q8, q0
+; HARDBEA-NEXT: vadd.f16 q8, q8, q8
+; HARDBEA-NEXT: vrev32.16 q8, q8
+; HARDBEA-NEXT: vadd.i32 q8, q8, q8
+; HARDBEA-NEXT: vrev64.32 q0, q8
+; HARDBEA-NEXT: bx lr
+;
+; SOFTFPBEA-LABEL: cast13:
+; SOFTFPBEA: @ %bb.0: @ %entry
+; SOFTFPBEA-NEXT: vmov d17, r3, r2
+; SOFTFPBEA-NEXT: vmov d16, r1, r0
+; SOFTFPBEA-NEXT: vrev64.16 q8, q8
+; SOFTFPBEA-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev32.16 q8, q8
+; SOFTFPBEA-NEXT: vadd.i32 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev64.32 q8, q8
+; SOFTFPBEA-NEXT: vmov r1, r0, d16
+; SOFTFPBEA-NEXT: vmov r3, r2, d17
+; SOFTFPBEA-NEXT: bx lr
+entry:
+ %b = fadd <8 x half> %a, %a
+ %d = bitcast <8 x half> %b to <4 x i32>
+ %e = add <4 x i32> %d, %d
+ ret <4 x i32> %e
+}
+
+define <4 x float> @cast14(<8 x half> %a) {
+; HARD-LABEL: cast14:
+; HARD: @ %bb.0: @ %entry
+; HARD-NEXT: vmov d17, r2, r3
+; HARD-NEXT: vmov d16, r0, r1
+; HARD-NEXT: vadd.f16 q8, q8, q8
+; HARD-NEXT: vadd.f32 q8, q8, q8
+; HARD-NEXT: vmov r0, r1, d16
+; HARD-NEXT: vmov r2, r3, d17
+; HARD-NEXT: bx lr
+;
+; SOFT-LABEL: cast14:
+; SOFT: @ %bb.0: @ %entry
+; SOFT-NEXT: vmov d17, r2, r3
+; SOFT-NEXT: vmov d16, r0, r1
+; SOFT-NEXT: vadd.f16 q8, q8, q8
+; SOFT-NEXT: vadd.f32 q8, q8, q8
+; SOFT-NEXT: vmov r0, r1, d16
+; SOFT-NEXT: vmov r2, r3, d17
+; SOFT-NEXT: bx lr
+;
+; HARDA-LABEL: cast14:
+; HARDA: @ %bb.0: @ %entry
+; HARDA-NEXT: vadd.f16 q8, q0, q0
+; HARDA-NEXT: vadd.f32 q0, q8, q8
+; HARDA-NEXT: bx lr
+;
+; SOFTA-LABEL: cast14:
+; SOFTA: @ %bb.0: @ %entry
+; SOFTA-NEXT: vmov d17, r2, r3
+; SOFTA-NEXT: vmov d16, r0, r1
+; SOFTA-NEXT: vadd.f16 q8, q8, q8
+; SOFTA-NEXT: vadd.f32 q8, q8, q8
+; SOFTA-NEXT: vmov r0, r1, d16
+; SOFTA-NEXT: vmov r2, r3, d17
+; SOFTA-NEXT: bx lr
+;
+; HARDBE-LABEL: cast14:
+; HARDBE: @ %bb.0: @ %entry
+; HARDBE-NEXT: vmov d17, r3, r2
+; HARDBE-NEXT: vmov d16, r1, r0
+; HARDBE-NEXT: vrev64.16 q8, q8
+; HARDBE-NEXT: vadd.f16 q8, q8, q8
+; HARDBE-NEXT: vrev32.16 q8, q8
+; HARDBE-NEXT: vadd.f32 q8, q8, q8
+; HARDBE-NEXT: vrev64.32 q8, q8
+; HARDBE-NEXT: vmov r1, r0, d16
+; HARDBE-NEXT: vmov r3, r2, d17
+; HARDBE-NEXT: bx lr
+;
+; SOFTFPBE-LABEL: cast14:
+; SOFTFPBE: @ %bb.0: @ %entry
+; SOFTFPBE-NEXT: vmov d17, r3, r2
+; SOFTFPBE-NEXT: vmov d16, r1, r0
+; SOFTFPBE-NEXT: vrev64.16 q8, q8
+; SOFTFPBE-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBE-NEXT: vrev32.16 q8, q8
+; SOFTFPBE-NEXT: vadd.f32 q8, q8, q8
+; SOFTFPBE-NEXT: vrev64.32 q8, q8
+; SOFTFPBE-NEXT: vmov r1, r0, d16
+; SOFTFPBE-NEXT: vmov r3, r2, d17
+; SOFTFPBE-NEXT: bx lr
+;
+; HARDBEA-LABEL: cast14:
+; HARDBEA: @ %bb.0: @ %entry
+; HARDBEA-NEXT: vrev64.16 q8, q0
+; HARDBEA-NEXT: vadd.f16 q8, q8, q8
+; HARDBEA-NEXT: vrev32.16 q8, q8
+; HARDBEA-NEXT: vadd.f32 q8, q8, q8
+; HARDBEA-NEXT: vrev64.32 q0, q8
+; HARDBEA-NEXT: bx lr
+;
+; SOFTFPBEA-LABEL: cast14:
+; SOFTFPBEA: @ %bb.0: @ %entry
+; SOFTFPBEA-NEXT: vmov d17, r3, r2
+; SOFTFPBEA-NEXT: vmov d16, r1, r0
+; SOFTFPBEA-NEXT: vrev64.16 q8, q8
+; SOFTFPBEA-NEXT: vadd.f16 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev32.16 q8, q8
+; SOFTFPBEA-NEXT: vadd.f32 q8, q8, q8
+; SOFTFPBEA-NEXT: vrev64.32 q8, q8
+; SOFTFPBEA-NEXT: vmov r1, r0, d16
+; SOFTFPBEA-NEXT: vmov r3, r2, d17
+; SOFTFPBEA-NEXT: bx lr
+entry:
+ %b = fadd <8 x half> %a, %a
+ %d = bitcast <8 x half> %b to <4 x float>
+ %e = fadd <4 x float> %d, %d
+ ret <4 x float> %e
+}