Index: lib/Target/ARM/ARMFastISel.cpp =================================================================== --- lib/Target/ARM/ARMFastISel.cpp +++ lib/Target/ARM/ARMFastISel.cpp @@ -189,6 +189,8 @@ unsigned ARMSelectCallOp(bool UseReg); unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT); + const TargetLowering *getTargetLowering() { return TM.getTargetLowering(); } + // Call handling routines. private: CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -3854,8 +3854,18 @@ // Turn f64->i64 into VMOVRRD. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { - SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, - DAG.getVTList(MVT::i32, MVT::i32), Op); + SDValue Cvt; + // On big-endian targets the lanes of a multi-element vector must be + // reversed before the 64-bit value is split into two i32 halves. A + // one-element vector (v1i64) needs no reordering; VREV64 has no 64-bit lanes. + if (TLI.isBigEndian() && SrcVT.isVector() && + SrcVT.getVectorNumElements() > 1) + Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), + DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); + else + Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Op); // Merge the pieces into a single i64 value. 
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); } Index: lib/Target/ARM/ARMInstrNEON.td =================================================================== --- lib/Target/ARM/ARMInstrNEON.td +++ lib/Target/ARM/ARMInstrNEON.td @@ -2366,9 +2366,9 @@ def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), (VST1q64 addrmode6:$addr, QPR:$value)>; def : Pat<(v2f64 (word_alignedload addrmode6:$addr)), - (VLD1q32 addrmode6:$addr)>; + (VLD1q32 addrmode6:$addr)>, Requires<[IsLE]>; def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr), - (VST1q32 addrmode6:$addr, QPR:$value)>; + (VST1q32 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>; def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)), (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>; def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), @@ -6176,67 +6176,145 @@ //===----------------------------------------------------------------------===// // bit_convert -def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>; -def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>; -def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>; +} def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>; -def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; -def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>; -def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>; -def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>; -def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>; + 
def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; +} def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; -def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; -def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>; -def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; -def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; -def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; -def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>; -def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>; -def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>; -def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>; -def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; +} def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>; -def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; -def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; -def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; -def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; -def : 
Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; -def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; +} def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>; -def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; -def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; +} -def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; -def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; -def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; +} def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; -def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; -def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; -def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; -def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; -def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 
(bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; +} def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; -def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; -def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; -def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; -def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; -def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; -def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; -def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; -def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; -def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; -def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; -def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; +} def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; -def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; -def : Pat<(v4f32 (bitconvert (v16i8 
QPR:$src))), (v4f32 QPR:$src)>; -def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; +} def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; -def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; -def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; -def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; -def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; +} + +let Predicates = [IsBE] in { + // 64 bit conversions: VREV<W>d<N>, W/N = wider/narrower element size in bits + def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 
DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; + + // 128 bit conversions: VREV<W>q<N>, W/N = wider/narrower element size in bits + def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 
QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; +} // Fold extracting an element out of a v2i32 into a vfp register. def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))), Index: test/CodeGen/ARM/big-endian-neon-bitconv.ll =================================================================== --- test/CodeGen/ARM/big-endian-neon-bitconv.ll +++ test/CodeGen/ARM/big-endian-neon-bitconv.ll @@ -0,0 +1,355 @@ +; RUN: llc < %s -march armeb -mattr v7,neon -o - | FileCheck %s + +@v2i64 = global <2 x i64> zeroinitializer +@v2i32 = global <2 x i32> zeroinitializer +@v4i32 = global <4 x i32> zeroinitializer +@v4i16 = global <4 x i16> zeroinitializer +@v8i16 = global <8 x i16> zeroinitializer +@v8i8 = global <8 x i8> zeroinitializer +@v16i8 = global <16 x i8> zeroinitializer + +@v2f32 = global <2 x float> zeroinitializer +@v2f64 = global <2 x double> zeroinitializer +@v4f32 = global <4 x float> zeroinitializer + + +; 64 bit conversions: i64/double <-> 64-bit vector bitcasts must emit vrev64.<lane bits> +define void @conv_i64_to_v8i8( i64 %val, <8 x i8>* %store ) { +; CHECK-LABEL: conv_i64_to_v8i8: +; CHECK: vrev64.8 + %v = bitcast i64 %val to <8 x i8> + %w = load <8 x i8>* @v8i8 + %a = add <8 x 
i8> %v, %w + store <8 x i8> %a, <8 x i8>* %store + ret void +} + +define void @conv_v8i8_to_i64( <8 x i8>* %load, <8 x i8>* %store ) { +; CHECK-LABEL: conv_v8i8_to_i64: +; CHECK: vrev64.8 + %v = load <8 x i8>* %load + %w = load <8 x i8>* @v8i8 + %a = add <8 x i8> %v, %w + %f = bitcast <8 x i8> %a to i64 + call void @conv_i64_to_v8i8( i64 %f, <8 x i8>* %store ) + ret void +} + +define void @conv_i64_to_v4i16( i64 %val, <4 x i16>* %store ) { +; CHECK-LABEL: conv_i64_to_v4i16: +; CHECK: vrev64.16 + %v = bitcast i64 %val to <4 x i16> + %w = load <4 x i16>* @v4i16 + %a = add <4 x i16> %v, %w + store <4 x i16> %a, <4 x i16>* %store + ret void +} + +define void @conv_v4i16_to_i64( <4 x i16>* %load, <4 x i16>* %store ) { +; CHECK-LABEL: conv_v4i16_to_i64: +; CHECK: vrev64.16 + %v = load <4 x i16>* %load + %w = load <4 x i16>* @v4i16 + %a = add <4 x i16> %v, %w + %f = bitcast <4 x i16> %a to i64 + call void @conv_i64_to_v4i16( i64 %f, <4 x i16>* %store ) + ret void +} + +define void @conv_i64_to_v2i32( i64 %val, <2 x i32>* %store ) { +; CHECK-LABEL: conv_i64_to_v2i32: +; CHECK: vrev64.32 + %v = bitcast i64 %val to <2 x i32> + %w = load <2 x i32>* @v2i32 + %a = add <2 x i32> %v, %w + store <2 x i32> %a, <2 x i32>* %store + ret void +} + +define void @conv_v2i32_to_i64( <2 x i32>* %load, <2 x i32>* %store ) { +; CHECK-LABEL: conv_v2i32_to_i64: +; CHECK: vrev64.32 + %v = load <2 x i32>* %load + %w = load <2 x i32>* @v2i32 + %a = add <2 x i32> %v, %w + %f = bitcast <2 x i32> %a to i64 + call void @conv_i64_to_v2i32( i64 %f, <2 x i32>* %store ) + ret void +} + +define void @conv_i64_to_v2f32( i64 %val, <2 x float>* %store ) { +; CHECK-LABEL: conv_i64_to_v2f32: +; CHECK: vrev64.32 + %v = bitcast i64 %val to <2 x float> + %w = load <2 x float>* @v2f32 + %a = fadd <2 x float> %v, %w + store <2 x float> %a, <2 x float>* %store + ret void +} + +define void @conv_v2f32_to_i64( <2 x float>* %load, <2 x float>* %store ) { +; CHECK-LABEL: conv_v2f32_to_i64: +; CHECK: vrev64.32 + %v = 
load <2 x float>* %load + %w = load <2 x float>* @v2f32 + %a = fadd <2 x float> %v, %w + %f = bitcast <2 x float> %a to i64 + call void @conv_i64_to_v2f32( i64 %f, <2 x float>* %store ) + ret void +} + +define void @conv_f64_to_v8i8( double %val, <8 x i8>* %store ) { +; CHECK-LABEL: conv_f64_to_v8i8: +; CHECK: vrev64.8 + %v = bitcast double %val to <8 x i8> + %w = load <8 x i8>* @v8i8 + %a = add <8 x i8> %v, %w + store <8 x i8> %a, <8 x i8>* %store + ret void +} + +define void @conv_v8i8_to_f64( <8 x i8>* %load, <8 x i8>* %store ) { +; CHECK-LABEL: conv_v8i8_to_f64: +; CHECK: vrev64.8 + %v = load <8 x i8>* %load + %w = load <8 x i8>* @v8i8 + %a = add <8 x i8> %v, %w + %f = bitcast <8 x i8> %a to double + call void @conv_f64_to_v8i8( double %f, <8 x i8>* %store ) + ret void +} + +define void @conv_f64_to_v4i16( double %val, <4 x i16>* %store ) { +; CHECK-LABEL: conv_f64_to_v4i16: +; CHECK: vrev64.16 + %v = bitcast double %val to <4 x i16> + %w = load <4 x i16>* @v4i16 + %a = add <4 x i16> %v, %w + store <4 x i16> %a, <4 x i16>* %store + ret void +} + +define void @conv_v4i16_to_f64( <4 x i16>* %load, <4 x i16>* %store ) { +; CHECK-LABEL: conv_v4i16_to_f64: +; CHECK: vrev64.16 + %v = load <4 x i16>* %load + %w = load <4 x i16>* @v4i16 + %a = add <4 x i16> %v, %w + %f = bitcast <4 x i16> %a to double + call void @conv_f64_to_v4i16( double %f, <4 x i16>* %store ) + ret void +} + +define void @conv_f64_to_v2i32( double %val, <2 x i32>* %store ) { +; CHECK-LABEL: conv_f64_to_v2i32: +; CHECK: vrev64.32 + %v = bitcast double %val to <2 x i32> + %w = load <2 x i32>* @v2i32 + %a = add <2 x i32> %v, %w + store <2 x i32> %a, <2 x i32>* %store + ret void +} + +define void @conv_v2i32_to_f64( <2 x i32>* %load, <2 x i32>* %store ) { +; CHECK-LABEL: conv_v2i32_to_f64: +; CHECK: vrev64.32 + %v = load <2 x i32>* %load + %w = load <2 x i32>* @v2i32 + %a = add <2 x i32> %v, %w + %f = bitcast <2 x i32> %a to double + call void @conv_f64_to_v2i32( double %f, <2 x i32>* %store ) + ret 
void +} + +define void @conv_f64_to_v2f32( double %val, <2 x float>* %store ) { +; CHECK-LABEL: conv_f64_to_v2f32: +; CHECK: vrev64.32 + %v = bitcast double %val to <2 x float> + %w = load <2 x float>* @v2f32 + %a = fadd <2 x float> %v, %w + store <2 x float> %a, <2 x float>* %store + ret void +} + +define void @conv_v2f32_to_f64( <2 x float>* %load, <2 x float>* %store ) { +; CHECK-LABEL: conv_v2f32_to_f64: +; CHECK: vrev64.32 + %v = load <2 x float>* %load + %w = load <2 x float>* @v2f32 + %a = fadd <2 x float> %v, %w + %f = bitcast <2 x float> %a to double + call void @conv_f64_to_v2f32( double %f, <2 x float>* %store ) + ret void +} + +; 128 bit conversions: i128/fp128 <-> 128-bit vector bitcasts must emit a vrev + + +define void @conv_i128_to_v16i8( i128 %val, <16 x i8>* %store ) { +; CHECK-LABEL: conv_i128_to_v16i8: +; CHECK: vrev32.8 + %v = bitcast i128 %val to <16 x i8> + %w = load <16 x i8>* @v16i8 + %a = add <16 x i8> %v, %w + store <16 x i8> %a, <16 x i8>* %store + ret void +} + +define void @conv_v16i8_to_i128( <16 x i8>* %load, <16 x i8>* %store ) { +; CHECK-LABEL: conv_v16i8_to_i128: +; CHECK: vrev32.8 + %v = load <16 x i8>* %load + %w = load <16 x i8>* @v16i8 + %a = add <16 x i8> %v, %w + %f = bitcast <16 x i8> %a to i128 + call void @conv_i128_to_v16i8( i128 %f, <16 x i8>* %store ) + ret void +} + +define void @conv_i128_to_v8i16( i128 %val, <8 x i16>* %store ) { +; CHECK-LABEL: conv_i128_to_v8i16: +; CHECK: vrev32.16 + %v = bitcast i128 %val to <8 x i16> + %w = load <8 x i16>* @v8i16 + %a = add <8 x i16> %v, %w + store <8 x i16> %a, <8 x i16>* %store + ret void +} + +define void @conv_v8i16_to_i128( <8 x i16>* %load, <8 x i16>* %store ) { +; CHECK-LABEL: conv_v8i16_to_i128: +; CHECK: vrev32.16 + %v = load <8 x i16>* %load + %w = load <8 x i16>* @v8i16 + %a = add <8 x i16> %v, %w + %f = bitcast <8 x i16> %a to i128 + call void @conv_i128_to_v8i16( i128 %f, <8 x i16>* %store ) + ret void +} + +define void @conv_i128_to_v4i32( i128 %val, <4 x i32>* %store ) { +; CHECK-LABEL: conv_i128_to_v4i32: +; CHECK: 
vrev64.32 + %v = bitcast i128 %val to <4 x i32> + %w = load <4 x i32>* @v4i32 + %a = add <4 x i32> %v, %w + store <4 x i32> %a, <4 x i32>* %store + ret void +} + +define void @conv_v4i32_to_i128( <4 x i32>* %load, <4 x i32>* %store ) { +; CHECK-LABEL: conv_v4i32_to_i128: +; CHECK: vrev64.32 + %v = load <4 x i32>* %load + %w = load <4 x i32>* @v4i32 + %a = add <4 x i32> %v, %w + %f = bitcast <4 x i32> %a to i128 + call void @conv_i128_to_v4i32( i128 %f, <4 x i32>* %store ) + ret void +} + +define void @conv_i128_to_v4f32( i128 %val, <4 x float>* %store ) { +; CHECK-LABEL: conv_i128_to_v4f32: +; CHECK: vrev64.32 + %v = bitcast i128 %val to <4 x float> + %w = load <4 x float>* @v4f32 + %a = fadd <4 x float> %v, %w + store <4 x float> %a, <4 x float>* %store + ret void +} + +define void @conv_v4f32_to_i128( <4 x float>* %load, <4 x float>* %store ) { +; CHECK-LABEL: conv_v4f32_to_i128: +; CHECK: vrev64.32 + %v = load <4 x float>* %load + %w = load <4 x float>* @v4f32 + %a = fadd <4 x float> %v, %w + %f = bitcast <4 x float> %a to i128 + call void @conv_i128_to_v4f32( i128 %f, <4 x float>* %store ) + ret void +} + +define void @conv_f128_to_v2f64( fp128 %val, <2 x double>* %store ) { +; CHECK-LABEL: conv_f128_to_v2f64: +; CHECK: vrev64.32 + %v = bitcast fp128 %val to <2 x double> + %w = load <2 x double>* @v2f64 + %a = fadd <2 x double> %v, %w + store <2 x double> %a, <2 x double>* %store + ret void +} + +define void @conv_v2f64_to_f128( <2 x double>* %load, <2 x double>* %store ) { +; CHECK-LABEL: conv_v2f64_to_f128: +; CHECK: vrev64.32 + %v = load <2 x double>* %load + %w = load <2 x double>* @v2f64 + %a = fadd <2 x double> %v, %w + %f = bitcast <2 x double> %a to fp128 + call void @conv_f128_to_v2f64( fp128 %f, <2 x double>* %store ) + ret void +} + +define void @conv_f128_to_v16i8( fp128 %val, <16 x i8>* %store ) { +; CHECK-LABEL: conv_f128_to_v16i8: +; CHECK: vrev32.8 + %v = bitcast fp128 %val to <16 x i8> + %w = load <16 x i8>* @v16i8 + %a = add <16 x i8> %v, %w + 
store <16 x i8> %a, <16 x i8>* %store + ret void +} + +define void @conv_v16i8_to_f128( <16 x i8>* %load, <16 x i8>* %store ) { +; CHECK-LABEL: conv_v16i8_to_f128: +; CHECK: vrev32.8 + %v = load <16 x i8>* %load + %w = load <16 x i8>* @v16i8 + %a = add <16 x i8> %v, %w + %f = bitcast <16 x i8> %a to fp128 + call void @conv_f128_to_v16i8( fp128 %f, <16 x i8>* %store ) + ret void +} + +define void @conv_f128_to_v8i16( fp128 %val, <8 x i16>* %store ) { +; CHECK-LABEL: conv_f128_to_v8i16: +; CHECK: vrev32.16 + %v = bitcast fp128 %val to <8 x i16> + %w = load <8 x i16>* @v8i16 + %a = add <8 x i16> %v, %w + store <8 x i16> %a, <8 x i16>* %store + ret void +} + +define void @conv_v8i16_to_f128( <8 x i16>* %load, <8 x i16>* %store ) { +; CHECK-LABEL: conv_v8i16_to_f128: +; CHECK: vrev32.16 + %v = load <8 x i16>* %load + %w = load <8 x i16>* @v8i16 + %a = add <8 x i16> %v, %w + %f = bitcast <8 x i16> %a to fp128 + call void @conv_f128_to_v8i16( fp128 %f, <8 x i16>* %store ) + ret void +} + +define void @conv_f128_to_v4f32( fp128 %val, <4 x float>* %store ) { +; CHECK-LABEL: conv_f128_to_v4f32: +; CHECK: vrev64.32 + %v = bitcast fp128 %val to <4 x float> + %w = load <4 x float>* @v4f32 + %a = fadd <4 x float> %v, %w + store <4 x float> %a, <4 x float>* %store + ret void +} + +define void @conv_v4f32_to_f128( <4 x float>* %load, <4 x float>* %store ) { +; CHECK-LABEL: conv_v4f32_to_f128: +; CHECK: vrev64.32 + %v = load <4 x float>* %load + %w = load <4 x float>* @v4f32 + %a = fadd <4 x float> %v, %w + %f = bitcast <4 x float> %a to fp128 + call void @conv_f128_to_v4f32( fp128 %f, <4 x float>* %store ) + ret void +} + Index: test/CodeGen/ARM/dagcombine-concatvector.ll =================================================================== --- test/CodeGen/ARM/dagcombine-concatvector.ll +++ test/CodeGen/ARM/dagcombine-concatvector.ll @@ -7,8 +7,8 @@ ; CHECK-LE-NEXT: vmov {{d[0-9]+}}, r1, r2 ; CHECK-LE-NEXT: vmov {{d[0-9]+}}, r3, [[REG]] ; CHECK-BE-NEXT: vmov {{d[0-9]+}}, r2, r1 -; 
CHECK-BE-NEXT: vmov {{d[0-9]+}}, [[REG]], r3 -; CHECK-NEXT: vst1.8 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0] +; CHECK-BE: vmov {{d[0-9]+}}, [[REG]], r3 +; CHECK: vst1.8 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0] ; CHECK-NEXT: bx lr define void @test1(i8* %arg, [4 x i64] %vec.coerce) { bb: Index: test/CodeGen/ARM/vcombine.ll =================================================================== --- test/CodeGen/ARM/vcombine.ll +++ test/CodeGen/ARM/vcombine.ll @@ -78,7 +78,7 @@ ; CHECK: vget_high8 ; CHECK-NOT: vst ; CHECK-LE: vmov r0, r1, d17 -; CHECK-BE: vmov r1, r0, d17 +; CHECK-BE: vmov r1, r0, d16 %tmp1 = load <16 x i8>* %A %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> ret <8 x i8> %tmp2