Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -245,7 +245,7 @@
   const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
 
   for (auto VT : IntTypes) {
-    addRegisterClass(VT, &ARM::QPRRegClass);
+    addRegisterClass(VT, &ARM::MQPRRegClass);
     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -289,7 +289,7 @@
   const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
 
   for (auto VT : FloatTypes) {
-    addRegisterClass(VT, &ARM::QPRRegClass);
+    addRegisterClass(VT, &ARM::MQPRRegClass);
     if (!HasMVEFP)
       setAllExpand(VT);
@@ -338,7 +338,7 @@
   // vector types is inhibited at integer-only level.
   const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
   for (auto VT : LongTypes) {
-    addRegisterClass(VT, &ARM::QPRRegClass);
+    addRegisterClass(VT, &ARM::MQPRRegClass);
     setAllExpand(VT);
     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5151,102 +5151,102 @@
 
 // Bit convert patterns
 let Predicates = [HasMVEInt] in {
-  def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v2i64 MQPR:$src))), (v2f64 MQPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v2f64 MQPR:$src))), (v2i64 MQPR:$src)>;
 
-  def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v4f32 MQPR:$src))), (v4i32 MQPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v4i32 MQPR:$src))), (v4f32 MQPR:$src)>;
 
-  def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v8f16 MQPR:$src))), (v8i16 MQPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v8i16 MQPR:$src))), (v8f16 MQPR:$src)>;
 }
 
 let Predicates = [IsLE,HasMVEInt] in {
-  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
-  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
-
-  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
-  def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
-
-  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
-  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
-
-  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
-  def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
-
-  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>;
-  def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
-
-  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
-  def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
-
-  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
-  def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 MQPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 MQPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 MQPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 MQPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 MQPR:$src)>;
+
+  def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 MQPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 MQPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 MQPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 MQPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 MQPR:$src)>;
+
+  def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 MQPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 MQPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 MQPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 MQPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 MQPR:$src)>;
+
+  def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 MQPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 MQPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 MQPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 MQPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 MQPR:$src)>;
+
+  def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 MQPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 MQPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 MQPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 MQPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 MQPR:$src)>;
+
+  def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 MQPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 MQPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 MQPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 MQPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 MQPR:$src)>;
+
+  def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 MQPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 MQPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 MQPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 MQPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 MQPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 MQPR:$src)>;
 }
 
 let Predicates = [IsBE,HasMVEInt] in {
-  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 (MVE_VREV64_32 QPR:$src))>;
-  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 (MVE_VREV64_32 QPR:$src))>;
-  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 (MVE_VREV64_16 QPR:$src))>;
-  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 (MVE_VREV64_16 QPR:$src))>;
-  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 (MVE_VREV64_8 QPR:$src))>;
-
-  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 (MVE_VREV64_32 QPR:$src))>;
-  def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 (MVE_VREV64_32 QPR:$src))>;
-  def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 (MVE_VREV64_16 QPR:$src))>;
-  def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 (MVE_VREV64_16 QPR:$src))>;
-  def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 (MVE_VREV64_8 QPR:$src))>;
-
-  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 (MVE_VREV64_32 QPR:$src))>;
-  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 (MVE_VREV64_32 QPR:$src))>;
-  def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 (MVE_VREV32_16 QPR:$src))>;
-  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 (MVE_VREV32_16 QPR:$src))>;
-  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 (MVE_VREV32_8 QPR:$src))>;
-
-  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 (MVE_VREV64_32 QPR:$src))>;
-  def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 (MVE_VREV64_32 QPR:$src))>;
-  def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 (MVE_VREV32_16 QPR:$src))>;
-  def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 (MVE_VREV32_16 QPR:$src))>;
-  def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 (MVE_VREV32_8 QPR:$src))>;
-
-  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 (MVE_VREV64_16 QPR:$src))>;
-  def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 (MVE_VREV64_16 QPR:$src))>;
-  def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 (MVE_VREV32_16 QPR:$src))>;
-  def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 (MVE_VREV32_16 QPR:$src))>;
-  def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 (MVE_VREV16_8 QPR:$src))>;
-
-  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 (MVE_VREV64_16 QPR:$src))>;
-  def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 (MVE_VREV64_16 QPR:$src))>;
-  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 (MVE_VREV32_16 QPR:$src))>;
-  def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 (MVE_VREV32_16 QPR:$src))>;
-  def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 (MVE_VREV16_8 QPR:$src))>;
-
-  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 (MVE_VREV64_8 QPR:$src))>;
-  def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 (MVE_VREV64_8 QPR:$src))>;
-  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 (MVE_VREV32_8 QPR:$src))>;
-  def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 (MVE_VREV32_8 QPR:$src))>;
-  def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 (MVE_VREV16_8 QPR:$src))>;
-  def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 (MVE_VREV16_8 QPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v4f32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v4i32 MQPR:$src))), (v2f64 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v8f16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v8i16 MQPR:$src))), (v2f64 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v16i8 MQPR:$src))), (v2f64 (MVE_VREV64_8 MQPR:$src))>;
+
+  def : Pat<(v2i64 (bitconvert (v4f32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v4i32 MQPR:$src))), (v2i64 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v8f16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v8i16 MQPR:$src))), (v2i64 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v16i8 MQPR:$src))), (v2i64 (MVE_VREV64_8 MQPR:$src))>;
+
+  def : Pat<(v4f32 (bitconvert (v2f64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v2i64 MQPR:$src))), (v4f32 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v8f16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v8i16 MQPR:$src))), (v4f32 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v16i8 MQPR:$src))), (v4f32 (MVE_VREV32_8 MQPR:$src))>;
+
+  def : Pat<(v4i32 (bitconvert (v2f64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v2i64 MQPR:$src))), (v4i32 (MVE_VREV64_32 MQPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v8f16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v8i16 MQPR:$src))), (v4i32 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v16i8 MQPR:$src))), (v4i32 (MVE_VREV32_8 MQPR:$src))>;
+
+  def : Pat<(v8f16 (bitconvert (v2f64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v2i64 MQPR:$src))), (v8f16 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v4f32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v4i32 MQPR:$src))), (v8f16 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v16i8 MQPR:$src))), (v8f16 (MVE_VREV16_8 MQPR:$src))>;
+
+  def : Pat<(v8i16 (bitconvert (v2f64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v2i64 MQPR:$src))), (v8i16 (MVE_VREV64_16 MQPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v4f32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v4i32 MQPR:$src))), (v8i16 (MVE_VREV32_16 MQPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v16i8 MQPR:$src))), (v8i16 (MVE_VREV16_8 MQPR:$src))>;
+
+  def : Pat<(v16i8 (bitconvert (v2f64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v2i64 MQPR:$src))), (v16i8 (MVE_VREV64_8 MQPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v4f32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v4i32 MQPR:$src))), (v16i8 (MVE_VREV32_8 MQPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v8f16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v8i16 MQPR:$src))), (v16i8 (MVE_VREV16_8 MQPR:$src))>;
 }
Index: llvm/test/CodeGen/Thumb2/mve-crash-qpr.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/mve-crash-qpr.ll
@@ -0,0 +1,324 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
+
+@.str = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+
+; Function Attrs: nounwind optsize
+define hidden arm_aapcs_vfpcc i32 @g() local_unnamed_addr #1 {
+; CHECK-LABEL: g:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #112
+; CHECK-NEXT:    sub sp, #112
+; CHECK-NEXT:    adr.w r8, .LCPI0_0
+; CHECK-NEXT:    adr.w r9, .LCPI0_1
+; CHECK-NEXT:    mov.w lr, #16
+; CHECK-NEXT:    vldrw.u32 q0, [r9]
+; CHECK-NEXT:    vldrw.u32 q1, [r8]
+; CHECK-NEXT:    vmov.i32 q2, #0x1
+; CHECK-NEXT:    add.w r1, sp, #98
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i16 q2, #0x6
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    vmov.i8 q3, #0x0
+; CHECK-NEXT:    vmov.i8 q4, #0xff
+; CHECK-NEXT:    vmov.i32 q6, #0x8
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT:  .LBB0_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    adds r3, r1, r0
+; CHECK-NEXT:    adds r0, #16
+; CHECK-NEXT:    vand q5, q0, q2
+; CHECK-NEXT:    vadd.i32 q0, q0, q6
+; CHECK-NEXT:    vcmp.i32 eq, q5, zr
+; CHECK-NEXT:    vand q5, q1, q2
+; CHECK-NEXT:    vpsel q7, q4, q3
+; CHECK-NEXT:    vcmp.i32 eq, q5, zr
+; CHECK-NEXT:    vpsel q2, q4, q3
+; CHECK-NEXT:    vadd.i32 q1, q1, q6
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov r2, s28
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov r2, s29
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov r2, s30
+; CHECK-NEXT:    vmov.16 q5[6], r2
+; CHECK-NEXT:    vmov r2, s31
+; CHECK-NEXT:    vmov.16 q5[7], r2
+; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vpt.i16 ne, q5, zr
+; CHECK-NEXT:    vstrht.16 q2, [r3]
+; CHECK-NEXT:    le lr, .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %for.cond2.preheader
+; CHECK-NEXT:    movs r0, #7
+; CHECK-NEXT:    bl h
+; CHECK-NEXT:    movw r6, :lower16:.L.str
+; CHECK-NEXT:    movt r6, :upper16:.L.str
+; CHECK-NEXT:    cbz r0, .LBB0_6
+; CHECK-NEXT:  @ %bb.3: @ %for.body4.lr.ph
+; CHECK-NEXT:    ldrsh.w r0, [sp, #98]
+; CHECK-NEXT:    movw r5, :lower16:.L.str
+; CHECK-NEXT:    movs r7, #128
+; CHECK-NEXT:    movt r5, :upper16:.L.str
+; CHECK-NEXT:    lsls r4, r0, #1
+; CHECK-NEXT:  .LBB0_4: @ %for.body4
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    mov r1, r5
+; CHECK-NEXT:    movs r2, #5
+; CHECK-NEXT:    bl h
+; CHECK-NEXT:    movs r0, #7
+; CHECK-NEXT:    adds r7, #2
+; CHECK-NEXT:    bl h
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    bne .LBB0_4
+; CHECK-NEXT:  @ %bb.5: @ %for.cond12.preheader
+; CHECK-NEXT:    movw r0, #65534
+; CHECK-NEXT:    movt r0, #511
+; CHECK-NEXT:    tst r7, r0
+; CHECK-NEXT:    bne .LBB0_7
+; CHECK-NEXT:    b .LBB0_10
+; CHECK-NEXT:  .LBB0_6:
+; CHECK-NEXT:    movs r7, #128
+; CHECK-NEXT:  .LBB0_7: @ %for.body14.lr.ph
+; CHECK-NEXT:    mvn r0, #-33554432
+; CHECK-NEXT:    add.w r4, sp, #98
+; CHECK-NEXT:    bics r0, r7
+; CHECK-NEXT:    adds r5, r7, r0
+; CHECK-NEXT:    adds r7, #1
+; CHECK-NEXT:  .LBB0_8: @ %for.body14
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    bl h
+; CHECK-NEXT:    adds r0, r7, #1
+; CHECK-NEXT:    lsls r1, r7, #7
+; CHECK-NEXT:    mov r7, r0
+; CHECK-NEXT:    bne .LBB0_8
+; CHECK-NEXT:  @ %bb.9: @ %for.end18.loopexit
+; CHECK-NEXT:    adds r7, r5, #1
+; CHECK-NEXT:  .LBB0_10: @ %for.end18
+; CHECK-NEXT:    sxth r0, r7
+; CHECK-NEXT:    movs r1, #9
+; CHECK-NEXT:    subs r0, r0, r6
+; CHECK-NEXT:    clz r0, r0
+; CHECK-NEXT:    lsrs r0, r0, #5
+; CHECK-NEXT:    bl h
+; CHECK-NEXT:    add.w r4, sp, #82
+; CHECK-NEXT:    movs r1, #2
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    bl h
+; CHECK-NEXT:    mov.w lr, #32
+; CHECK-NEXT:    vldrw.u32 q0, [r9]
+; CHECK-NEXT:    vldrw.u32 q1, [r8]
+; CHECK-NEXT:    vmov.i32 q2, #0x1
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i16 q2, #0x6
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    vmov.i8 q3, #0x0
+; CHECK-NEXT:    vmov.i8 q4, #0xff
+; CHECK-NEXT:    vmov.i32 q6, #0x8
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
+; CHECK-NEXT:  .LBB0_11: @ %vector.body92
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    adds r2, r4, r0
+; CHECK-NEXT:    adds r0, #16
+; CHECK-NEXT:    vand q5, q0, q2
+; CHECK-NEXT:    vadd.i32 q0, q0, q6
+; CHECK-NEXT:    vcmp.i32 eq, q5, zr
+; CHECK-NEXT:    vand q5, q1, q2
+; CHECK-NEXT:    vpsel q7, q4, q3
+; CHECK-NEXT:    vcmp.i32 eq, q5, zr
+; CHECK-NEXT:    vpsel q2, q4, q3
+; CHECK-NEXT:    vadd.i32 q1, q1, q6
+; CHECK-NEXT:    vmov r1, s8
+; CHECK-NEXT:    vmov.16 q5[0], r1
+; CHECK-NEXT:    vmov r1, s9
+; CHECK-NEXT:    vmov.16 q5[1], r1
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov.16 q5[2], r1
+; CHECK-NEXT:    vmov r1, s11
+; CHECK-NEXT:    vmov.16 q5[3], r1
+; CHECK-NEXT:    vmov r1, s28
+; CHECK-NEXT:    vmov.16 q5[4], r1
+; CHECK-NEXT:    vmov r1, s29
+; CHECK-NEXT:    vmov.16 q5[5], r1
+; CHECK-NEXT:    vmov r1, s30
+; CHECK-NEXT:    vmov.16 q5[6], r1
+; CHECK-NEXT:    vmov r1, s31
+; CHECK-NEXT:    vmov.16 q5[7], r1
+; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vpt.i16 ne, q5, zr
+; CHECK-NEXT:    vstrht.16 q2, [r2]
+; CHECK-NEXT:    le lr, .LBB0_11
+; CHECK-NEXT:  @ %bb.12: @ %for.end41
+; CHECK-NEXT:    vldrw.u32 q0, [r8]
+; CHECK-NEXT:    add r1, sp, #44
+; CHECK-NEXT:    mov.w lr, #128
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    vmov.i32 q1, #0x6
+; CHECK-NEXT:    vmov.i32 q2, #0x4
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB0_13: @ %vector.body102
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    adds r2, r1, r0
+; CHECK-NEXT:    adds r0, #16
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vstrwt.32 q1, [r2]
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
+; CHECK-NEXT:    le lr, .LBB0_13
+; CHECK-NEXT:  @ %bb.14: @ %for.end53
+; CHECK-NEXT:    add r0, sp, #44
+; CHECK-NEXT:    bl h
+; CHECK-NEXT:    add sp, #112
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.15:
+; CHECK-NEXT:  .LCPI0_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 3 @ 0x3
+; CHECK-NEXT:  .LCPI0_1:
+; CHECK-NEXT:    .long 4 @ 0x4
+; CHECK-NEXT:    .long 5 @ 0x5
+; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 7 @ 0x7
+entry:
+  %d = alloca [7 x i16], align 2
+  %d24 = alloca [8 x i16], align 2
+  %k = alloca [9 x i32], align 4
+  %0 = bitcast [7 x i16]* %d to i8*
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %entry ], [ %vec.ind.next, %vector.body ]
+  %1 = and <8 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %2 = icmp eq <8 x i32> %1, zeroinitializer
+  %3 = getelementptr inbounds [7 x i16], [7 x i16]* %d, i32 0, i32 %index
+  %4 = bitcast i16* %3 to <8 x i16>*
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> <i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6>, <8 x i16>* %4, i32 2, <8 x i1> %2)
+  %index.next = add i32 %index, 8
+  %vec.ind.next = add <8 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  %5 = icmp eq i32 %index.next, 128
+  br i1 %5, label %for.cond2.preheader, label %vector.body
+
+for.cond2.preheader:                              ; preds = %vector.body
+  %call379 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @h to i32 (i32)*)(i32 7) #5
+  %tobool80 = icmp eq i32 %call379, 0
+  br i1 %tobool80, label %for.body14.lr.ph, label %for.body4.lr.ph
+
+for.body4.lr.ph:                                  ; preds = %for.cond2.preheader
+  %arrayidx5 = getelementptr inbounds [7 x i16], [7 x i16]* %d, i32 0, i32 0
+  %6 = load i16, i16* %arrayidx5, align 2
+  %conv = sext i16 %6 to i32
+  %add = shl nsw i32 %conv, 1
+  br label %for.body4
+
+for.cond12.preheader:                             ; preds = %for.body4
+  %shl.mask76 = and i32 %add10, 33554430
+  %tobool1377 = icmp eq i32 %shl.mask76, 0
+  br i1 %tobool1377, label %for.end18, label %for.body14.lr.ph
+
+for.body14.lr.ph:                                 ; preds = %for.cond2.preheader, %for.cond12.preheader
+  %e.1.lcssa91 = phi i32 [ %add10, %for.cond12.preheader ], [ 128, %for.cond2.preheader ]
+  %arraydecay = getelementptr inbounds [7 x i16], [7 x i16]* %d, i32 0, i32 0
+  %7 = and i32 %e.1.lcssa91, 33554431
+  %8 = xor i32 %7, 33554431
+  %9 = add i32 %e.1.lcssa91, %8
+  br label %for.body14
+
+for.body4:                                        ; preds = %for.body4.lr.ph, %for.body4
+  %e.181 = phi i32 [ 128, %for.body4.lr.ph ], [ %add10, %for.body4 ]
+  %call8 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @h to i32 (i32, i8*, i32)*)(i32 %add, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i32 0, i32 0), i32 5) #5
+  %add10 = add nuw nsw i32 %e.181, 2
+  %call3 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @h to i32 (i32)*)(i32 7) #5
+  %tobool = icmp eq i32 %call3, 0
+  br i1 %tobool, label %for.cond12.preheader, label %for.body4
+
+for.body14:                                       ; preds = %for.body14.lr.ph, %for.body14
+  %e.278 = phi i32 [ %e.1.lcssa91, %for.body14.lr.ph ], [ %inc17, %for.body14 ]
+  %call15 = call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @h to i32 (i16*)*)(i16* nonnull %arraydecay) #5
+  %inc17 = add i32 %e.278, 1
+  %lftr.wideiv = trunc i32 %inc17 to i25
+  %exitcond86 = icmp eq i25 %lftr.wideiv, 0
+  br i1 %exitcond86, label %for.end18.loopexit, label %for.body14
+
+for.end18.loopexit:                               ; preds = %for.body14
+  %10 = add i32 %9, 1
+  br label %for.end18
+
+for.end18:                                        ; preds = %for.end18.loopexit, %for.cond12.preheader
+  %e.2.lcssa = phi i32 [ %add10, %for.cond12.preheader ], [ %10, %for.end18.loopexit ]
+  %sext = shl i32 %e.2.lcssa, 16
+  %conv20 = ashr exact i32 %sext, 16
+  %11 = inttoptr i32 %conv20 to i8*
+  %cmp21 = icmp eq i8* %11, getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i32 0, i32 0)
+  %conv22 = zext i1 %cmp21 to i32
+  %call23 = call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @h to i32 (i32, i32)*)(i32 %conv22, i32 9) #5
+  %12 = bitcast [8 x i16]* %d24 to i8*
+  %arraydecay26 = getelementptr inbounds [8 x i16], [8 x i16]* %d24, i32 0, i32 0
+  %call27 = call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @h to i32 (i16*, i32)*)(i16* nonnull %arraydecay26, i32 2) #5
+  br label %vector.body92
+
+vector.body92:                                    ; preds = %vector.body92, %for.end18
+  %index96 = phi i32 [ 0, %for.end18 ], [ %index.next97, %vector.body92 ]
+  %vec.ind100 = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %for.end18 ], [ %vec.ind.next101, %vector.body92 ]
+  %13 = and <8 x i32> %vec.ind100, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %14 = icmp eq <8 x i32> %13, zeroinitializer
+  %15 = getelementptr inbounds [8 x i16], [8 x i16]* %d24, i32 0, i32 %index96
+  %16 = bitcast i16* %15 to <8 x i16>*
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> <i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6>, <8 x i16>* %16, i32 2, <8 x i1> %14)
+  %index.next97 = add i32 %index96, 8
+  %vec.ind.next101 = add <8 x i32> %vec.ind100, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  %17 = icmp eq i32 %index.next97, 256
+  br i1 %17, label %for.end41, label %vector.body92
+
+for.end41:                                        ; preds = %vector.body92
+  %18 = bitcast [9 x i32]* %k to i8*
+  br label %vector.body102
+
+vector.body102:                                   ; preds = %vector.body102, %for.end41
+  %index106 = phi i32 [ 0, %for.end41 ], [ %index.next107, %vector.body102 ]
+  %vec.ind110 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %for.end41 ], [ %vec.ind.next111, %vector.body102 ]
+  %19 = icmp ne <4 x i32> %vec.ind110, zeroinitializer
+  %20 = getelementptr inbounds [9 x i32], [9 x i32]* %k, i32 0, i32 %index106
+  %21 = bitcast i32* %20 to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> <i32 6, i32 6, i32 6, i32 6>, <4 x i32>* %21, i32 4, <4 x i1> %19)
+  %index.next107 = add i32 %index106, 4
+  %vec.ind.next111 = add <4 x i32> %vec.ind110, <i32 4, i32 4, i32 4, i32 4>
+  %22 = icmp eq i32 %index.next107, 512
+  br i1 %22, label %for.end53, label %vector.body102
+
+for.end53:                                        ; preds = %vector.body102
+  %arraydecay54 = getelementptr inbounds [9 x i32], [9 x i32]* %k, i32 0, i32 0
+  %call55 = call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @h to i32 (i32*)*)(i32* nonnull %arraydecay54) #5
+  ret i32 undef
+}
+
+; Function Attrs: optsize
+declare dso_local arm_aapcs_vfpcc i32 @h(...)
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2