Index: lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.td +++ lib/Target/AArch64/AArch64InstrInfo.td @@ -21,6 +21,9 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto","crypto">; +def IsLE : Predicate<"Subtarget->isLittle()">; +def IsBE : Predicate<"!Subtarget->isLittle()">; + // Use fused MAC if more precision in FP computation is allowed. def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" " FPOpFusion::Fast)">; @@ -2612,13 +2615,13 @@ } let Predicates = [HasFPARMv8] in { -def LDRs_lit : A64I_LDRlitSimple<0b00, 0b1, FPR32>; -def LDRd_lit : A64I_LDRlitSimple<0b01, 0b1, FPR64>; + def LDRs_lit : A64I_LDRlitSimple<0b00, 0b1, FPR32>; + def LDRd_lit : A64I_LDRlitSimple<0b01, 0b1, FPR64>; } let mayLoad = 1 in { let Predicates = [HasFPARMv8] in { - def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>; + def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>; } def LDRSWx_lit : A64I_LDRlit<0b10, 0b0, @@ -5032,6 +5035,85 @@ : ls_neutral_pats, ls_atomic_pats; + +// Wrappers to instantiate all allowed same-size fp/vector loads + +// NEON-BE: allow all neon vectors as well, since ld1/st1 must be disabled +// LD1 & ST1 are not ABI conforming in big endian: wrong arg memory layout +// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf +// section 4.1.2, 2nd paragraph: LDR/STR layout +// "on a big-endian system element 0 will contain the highest-addressed +// element of a short vector." 
+// FIXME: eventually also enable for LE +// (desired by ARM - smaller code due to more powerful addressing modes) + +// NEON 8 bit types +multiclass ls_FPR8_pats { + let Predicates = [HasNEON] in { + defm : ls_neutral_pats; + } +} + +// NEON 16 bit types +multiclass ls_FPR16_pats { + let Predicates = [HasFPARMv8] in { + defm : ls_neutral_pats; + } + + let Predicates = [HasNEON] in { + defm : ls_neutral_pats; + } +} + +// NEON 32 bit types +multiclass ls_FPR32_pats { + let Predicates = [HasFPARMv8] in { + defm : ls_neutral_pats; + } + + let Predicates = [HasNEON] in { + defm : ls_neutral_pats; +// defm : ls_neutral_pats; does not exist - v1f64 DOES -- WHY ? + } +} + +// NEON 64 bit types +multiclass ls_FPR64_pats { + let Predicates = [HasFPARMv8] in { + defm : ls_neutral_pats; + } + + let Predicates = [HasNEON] in { + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + } +} + +// NEON 128 bit types FPR128 +multiclass ls_FPR128_pats { + let Predicates = [HasFPARMv8] in { + defm : ls_neutral_pats; + } + + let Predicates = [HasNEON] in { + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + } +} + //===------------------------------ // 2.2. 
Addressing-mode instantiations //===------------------------------ @@ -5075,37 +5157,40 @@ !subst(ALIGN, min_align8, decls.pattern))), i64>; - defm : ls_neutral_pats; + + defm : ls_FPR16_pats< LSFP16_LDR, LSFP16_STR, Base, + !foreach(decls.pattern, Offset, !subst(OFFSET, hword_uimm12, decls.pattern)), !foreach(decls.pattern, address, !subst(OFFSET, hword_uimm12, - !subst(ALIGN, min_align2, decls.pattern))), - f16>; + !subst(ALIGN, min_align2, decls.pattern)))>; - defm : ls_neutral_pats; + !subst(ALIGN, min_align4, decls.pattern)))>; - defm : ls_neutral_pats; + !subst(ALIGN, min_align8, decls.pattern)))>; - defm : ls_neutral_pats; + !subst(ALIGN, min_align16, decls.pattern)))>; defm : load_signed_pats<"B", "", Base, !foreach(decls.pattern, Offset, @@ -5175,11 +5260,10 @@ defm : ls_int_neutral_pats; defm : ls_int_neutral_pats; - defm : ls_neutral_pats; - defm : ls_neutral_pats; - defm : ls_neutral_pats; - defm : ls_neutral_pats; + defm : ls_FPR16_pats; + defm : ls_FPR32_pats; + defm : ls_FPR64_pats; + defm : ls_FPR128_pats; def : Pat<(i64 (zextloadi32 address)), (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>; Index: lib/Target/AArch64/AArch64InstrNEON.td =================================================================== --- lib/Target/AArch64/AArch64InstrNEON.td +++ lib/Target/AArch64/AArch64InstrNEON.td @@ -104,10 +104,19 @@ defm : ls_128_pats; } -defm : uimm12_neon_pats<(A64WrapperSmall - tconstpool:$Hi, tconstpool:$Lo12, ALIGN), - (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>; - +// LDR is only valid for little endian. +// In BE LDR needs correctly byte-swapped 128bit literals, so simple array +// initializers won't work right now. +// Big-endian must - for now - do the element swaps using vector intrinsics. +// That's an additional "add offset12" instruction, there. +// According to ARM, BE & LE should use intrinsics for initialization. +// That's also the only portable code. +// FIXME: BE could use vector-literal-swapping before emit pass. 
+let Predicates = [IsLE] in { // TODO: this will eventually be removed + defm : uimm12_neon_pats<(A64WrapperSmall + tconstpool:$Hi, tconstpool:$Lo12, ALIGN), + (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>; +} //===----------------------------------------------------------------------===// // Multiclasses //===----------------------------------------------------------------------===// @@ -3432,9 +3441,31 @@ // the three 64-bit vectors list {BA, DC, FE}. // E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three // 64-bit vectors list {DA, EB, FC}. -// Store instructions store multiple structure to N registers like load. +// Store instructions store multiple structures from N registers, like loads. +// +// Problem for Big Endian (BE): +// LD1/ST1 do "array" loads/stores - reading elements from ascending addresses +// into ascending indexes in the register; in big-endian, byte-swapping is done +// per element. (hence LD1 & Co are sometimes referred to as "array loads".) +// +// LDR/STR read the whole register doing byte-swapping on the whole register +// in big-endian mode. +// +// Obviously the two layouts differ by reversing the elements so they can't be +// mixed without explicit element-swap operations in BE. +// +// The only overlap is reading single elements to registers: +// LDR i128/f128 - doing byte-swapping for the whole register. +// LD1/ST1 i128/f128 - also doing byte-swapping within the 128bit element. +// Analogously for stores. +// For this reason there are IsLE guards around the respective patterns, or - +// when no patterns are defined, yet - around the instruction definition. +// In a PBQP matcher, one would add a separate set of "reversed" nonterminals +// with the element swap operations as chain rules - and let the matcher find +// the optimal coverage. FIXME: How to do that here ? 
+ class NeonI_LDVList opcode, bits<2> size, RegisterOperand VecList, string asmop> : NeonI_LdStMult; + +// LD1 disallowed in BE, when LDR and STR are used exclusively as per the ABI. +// reason: LDR/STR use different memory/register layout (no element swaps). +// If different types of loads were used from the same memory address the results +// will be inconsistent. +// The only allowed use of LD1 is in initializations using explicit intrinsics to do +// the element-swaps. + +// Single element has no swapping problem in BE. def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">; -defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">; +// Multiple elements would be reversed in BE. +let Predicates = [IsLE] in { + defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">; -defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">; + defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">; -defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">; + defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">; + defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">; +} + // Load multiple 1-element structure to N consecutive registers (N = 2,3,4) defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">; def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">; @@ -3526,73 +3570,79 @@ } // Store multiple N-element structures from N registers (N = 1,2,3,4) -defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">; +// ARM ABI: default memory layout in BE is LDR/STR +// Single element has no swapping problem in BE. def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">; -defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">; +// Multiple elements would be reversed in BE. 
+let Predicates = [IsLE] in { + defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">; -defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">; + defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">; -defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">; + defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">; -// Store multiple 1-element structures from N consecutive registers (N = 2,3,4) -defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">; -def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; + defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">; -defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">; -def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; + // Store multiple 1-element structures from N consecutive registers (N = 2,3,4) + defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">; + def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; -defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">; -def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; + defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">; + def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; -def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; -def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; + defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">; + def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; -def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; -def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; + def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; + def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; -def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>; -def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>; + def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; + def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; -def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D 
GPR64xsp:$addr)>; -def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; + def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>; + def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>; -def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; -def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; + def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; + def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; -def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; -def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; + def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; + def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; -def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), - (ST1_2D GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), - (ST1_2D GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; + def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; -def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), - (ST1_4S GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), - (ST1_4S GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), - (ST1_8H GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), - (ST1_16B GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), - (ST1_1D 
GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), - (ST1_1D GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), + (ST1_8H GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), + (ST1_16B GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), - (ST1_2S GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), - (ST1_2S GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), - (ST1_4H GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), - (ST1_8B GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), + (ST1_4H GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), + (ST1_8B GPR64xsp:$addr, VPR64:$value)>; +} + // Match load/store of v1i8/v1i16/v1i32 type to FPR8/FPR16/FPR32 load/store. // FIXME: for now we have v1i8, v1i16, v1i32 legal types, if they are illegal, // these patterns are not needed any more. @@ -3776,35 +3826,40 @@ ImmTy2, asmop>; } -// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) -defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">; +// Single element loads are ok for BE. defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, "ld1">; -defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">; +// Multiple elements would be reversed in BE. 
+let Predicates = [IsLE] in { + // Post-index load multiple N-element structures from N registers (N = 1,2,3,4) + defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">; -defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, - "ld3">; + defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">; -defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">; + defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, + "ld3">; -// Post-index load multiple 1-element structures from N consecutive registers -// (N = 2,3,4) -defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, - "ld1">; -defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand, - uimm_exact16, "ld1">; + defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">; -defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, - "ld1">; -defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand, - uimm_exact24, "ld1">; + // Post-index load multiple 1-element structures to N consecutive registers + // (N = 2,3,4) + defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, + "ld1">; + defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand, + uimm_exact16, "ld1">; -defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, - "ld1">; -defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand, - uimm_exact32, "ld1">; + defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, + "ld1">; + defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand, + uimm_exact24, "ld1">; + defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, + "ld1">; + defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand, + uimm_exact32, "ld1">; +} + multiclass NeonI_STWB_VList opcode, bits<2> size, RegisterOperand VecList, Operand 
ImmTy, string asmop> { @@ -3861,33 +3916,36 @@ } // Post-index load multiple N-element structures from N registers (N = 1,2,3,4) -defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">; -defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, - "st1">; +// Storing multiple elements in BE mode suffers from element-reversal. +let Predicates = [IsLE] in { + defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, + "st1">; + defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">; -defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">; + defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">; -defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, - "st3">; + defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, + "st3">; -defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">; + defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">; -// Post-index load multiple 1-element structures from N consecutive registers -// (N = 2,3,4) -defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, - "st1">; -defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand, - uimm_exact16, "st1">; + // Post-index store multiple 1-element structures from N consecutive registers + // (N = 2,3,4) + defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, + "st1">; + defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand, + uimm_exact16, "st1">; -defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, - "st1">; -defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand, - uimm_exact24, "st1">; + defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, + "st1">; + defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand, + 
uimm_exact24, "st1">; -defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, - "st1">; -defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand, - uimm_exact32, "st1">; + defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, + "st1">; + defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand, + uimm_exact32, "st1">; +} // End of post-index vector load/store multiple N-element structure // (class SIMD lselem-post) @@ -3963,13 +4021,17 @@ } // Load single 1-element structure to all lanes of 1 register +// Single element loads are fine in BE defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">; // Load single N-element structure to all lanes of N consecutive // registers (N = 2,3,4) -defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">; -defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">; -defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">; +// Multi-element loads suffer from element reversal in BE. +let Predicates = [IsLE] in { + defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">; + defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">; + defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">; +} class LD1R_pattern ; // Match all LD1R instructions -def : LD1R_pattern; +// This won't work as intended in BE mode, as STR q0 stored the elements swapped. +let Predicates = [IsLE] in { + def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; + def : LD1R_pattern; +} -def : LD1R_pattern; -def : LD1R_pattern; - class LD1R_pattern_v1 : Pat<(VTy (scalar_to_vector (DTy (LoadOp GPR64xsp:$Rn)))), (VTy (INST GPR64xsp:$Rn))>; +// Single element operations are swap-safe in BE. 
def : LD1R_pattern_v1; def : LD1R_pattern_v1; @@ -4064,46 +4129,64 @@ } // Load single 1-element structure to one lane of 1 register. +// No dangerous element swaps in BE. :-) defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">; // Load single N-element structure to one lane of N consecutive registers // (N = 2,3,4) -defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; -defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; -defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; +// +// This will not work as intended in BE mode, if the matcher generates it to +// load a vector to a lane. (STR q0 stored the vector's elements swapped) +// Must always use an intrinsic, so the user knows it's loading from an array +// layout. +let Predicates = [IsLE] in { + defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; + defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; + defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; +} -multiclass LD1LN_patterns { - def : Pat<(VTy (vector_insert (VTy VPR64:$src), - (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))), - (VTy (EXTRACT_SUBREG - (INST GPR64xsp:$Rn, - (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - ImmOp:$lane), - sub_64))>; +// Multiple elements would be reversed in BE. 
+let Predicates = [IsLE] in { + multiclass LD1LN_patterns { + def : Pat<(VTy (vector_insert (VTy VPR64:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))), + (VTy (EXTRACT_SUBREG + (INST GPR64xsp:$Rn, + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + ImmOp:$lane), + sub_64))>; - def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src), - (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))), - (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>; + def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))), + (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>; + } } // Match all LD1LN instructions -defm : LD1LN_patterns; +// +// This will not work as intended in BE mode, if the matcher generates it to +// load a vector to a lane. (STR q0 stored the elements swapped in BE) +// Must always use an intrinsic, so the user knows it's loading from an array +// layout. +let Predicates = [IsLE] in { + defm : LD1LN_patterns; -defm : LD1LN_patterns; + defm : LD1LN_patterns; -defm : LD1LN_patterns; -defm : LD1LN_patterns; + defm : LD1LN_patterns; + defm : LD1LN_patterns; -defm : LD1LN_patterns; -defm : LD1LN_patterns; + defm : LD1LN_patterns; + defm : LD1LN_patterns; +} class NeonI_STN_Lane op2_1, bit op0, RegisterOperand VList, Operand ImmOp, string asmop> @@ -4149,13 +4232,17 @@ } // Store single 1-element structure from one lane of 1 register. +// single element should be fine in BE - no swapping of elements. defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">; // Store single N-element structure from one lane of N consecutive registers // (N = 2,3,4) -defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; -defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; -defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; +// Multiple elements would be reversed in BE. 
+let Predicates = [IsLE] in { + defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; + defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; + defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; +} multiclass ST1LN_patterns; +// +// Multiple elements would be reversed in BE. +let Predicates = [IsLE] in { + defm : ST1LN_patterns; -defm : ST1LN_patterns; + defm : ST1LN_patterns; -defm : ST1LN_patterns; -defm : ST1LN_patterns; + defm : ST1LN_patterns; + defm : ST1LN_patterns; -defm : ST1LN_patterns; -defm : ST1LN_patterns; - + defm : ST1LN_patterns; + defm : ST1LN_patterns; +} // End of vector load/store single N-element structure (class SIMD lsone). @@ -4256,17 +4346,21 @@ } // Post-index load single 1-element structure to all lanes of 1 register +// one element duplication should be fine in BE - no swapping of elements. defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1, uimm_exact2, uimm_exact4, uimm_exact8>; // Post-index load single N-element structure to all lanes of N consecutive // registers (N = 2,3,4) -defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; +// Multiple elements would be reversed in BE. 
+let Predicates = [IsLE] in { + defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; + defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; + defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; +} let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb, $Rt = $src", @@ -4357,18 +4451,22 @@ } // Post-index load single 1-element structure to one lane of 1 register. +// One element from 1 lane is fine in BE - no swapping of elements. defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1, uimm_exact2, uimm_exact4, uimm_exact8>; // Post-index load single N-element structure to one lane of N consecutive // registers // (N = 2,3,4) -defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; +// Multiple elements would be reversed in BE. +let Predicates = [IsLE] in { + defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; + defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; + defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; +} let mayStore = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb", @@ -4459,17 +4557,21 @@ } // Post-index store single 1-element structure from one lane of 1 register. +// one element from 1 lane should be fine in BE - no swapping of elements. 
defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1, uimm_exact2, uimm_exact4, uimm_exact8>; // Post-index store single N-element structure from one lane of N consecutive // registers (N = 2,3,4) -defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; +// Multiple elements would be reversed in BE. +let Predicates = [IsLE] in { + defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; + defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; + defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; +} // End of post-index load/store single N-element instructions // (class SIMD lsone-post) Index: test/CodeGen/AArch64/128bit_load_store.ll =================================================================== --- test/CodeGen/AArch64/128bit_load_store.ll +++ test/CodeGen/AArch64/128bit_load_store.ll @@ -1,5 +1,118 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=neon | FileCheck %s ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s +define void @test_store_v1i8(<1 x i8>* %ptr, <1 x i8> %val) #0 { +; CHECK: test_store_v1i8 +; CHECK: str {{b[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x i8> %val, <1 x i8>* %ptr, align 8 + ret void +} + + + +define void @test_store_f16(half* %ptr, half %val) #0 { +; CHECK: test_store_f16 +; CHECK: str {{h[0-9]+}}, [{{x[0-9]+}}] +entry: + store half %val, half* %ptr, align 8 + ret void +} + +define void @test_store_v1i16(<1 x i16>* %ptr, <1 x i16> %val) #0 { +; CHECK: test_store_v1i16 
+; CHECK: str {{h[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x i16> %val, <1 x i16>* %ptr, align 8 + ret void +} + + + +define void @test_store_f32(float* %ptr, float %val) #0 { +; CHECK: test_store_f32 +; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}] +entry: + store float %val, float* %ptr, align 8 + ret void +} + +define void @test_store_v1f32(<1 x float>* %ptr, <1 x float> %val) #0 { +; CHECK: test_store_v1f32 +; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x float> %val, <1 x float>* %ptr, align 8 + ret void +} + +define void @test_store_v1i32(<1 x i32>* %ptr, <1 x i32> %val) #0 { +; CHECK: test_store_v1i32 +; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x i32> %val, <1 x i32>* %ptr, align 8 + ret void +} + + +define void @test_store_f64(double *%ptr, double %val) #0 { +; CHECK: test_store_f64 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store double %val, double* %ptr, align 8 + ret void +} + +define void @test_store_v1f64(<1 x double>* %ptr, <1 x double> %val) #0 { +; CHECK: test_store_v1f64 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x double> %val, <1 x double>* %ptr, align 8 + ret void +} + +define void @test_store_v2f32(<2 x float>* %ptr, <2 x float> %val) #0 { +; CHECK: test_store_v2f32 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <2 x float> %val, <2 x float>* %ptr, align 8 + ret void +} + +define void @test_store_v1i64(<1 x i64>* %ptr, <1 x i64> %val) #0 { +; CHECK: test_store_v1i64 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x i64> %val, <1 x i64>* %ptr, align 8 + ret void +} + +define void @test_store_v2i32(<2 x i32>* %ptr, <2 x i32> %val) #0 { +; CHECK: test_store_v2i32 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <2 x i32> %val, <2 x i32>* %ptr, align 8 + ret void +} + +define void @test_store_v4i16(<4 x i16>* %ptr, <4 x i16> %val) #0 { +; CHECK: test_store_v4i16 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <4 x i16> %val, <4 x i16>* %ptr, align 8 + ret void +} 
+ +define void @test_store_v8i8(<8 x i8>* %ptr, <8 x i8> %val) #0 { +; CHECK: test_store_v8i8 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <8 x i8> %val, <8 x i8>* %ptr, align 8 + ret void +} + + + + define void @test_store_f128(fp128* %ptr, fp128 %val) #0 { ; CHECK: test_store_f128 ; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] @@ -8,6 +121,54 @@ ret void } +define void @test_store_v2f64(<2 x double>* %ptr, <2 x double> %val) #0 { +; CHECK: test_store_v2f64 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <2 x double> %val, <2 x double>* %ptr, align 16 + ret void +} + +define void @test_store_v4f32(<4 x float>* %ptr, <4 x float> %val) #0 { +; CHECK: test_store_v4f32 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <4 x float> %val, <4 x float>* %ptr, align 16 + ret void +} + +define void @test_store_v2i64(<2 x i64>* %ptr, <2 x i64> %val) #0 { +; CHECK: test_store_v2i64 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <2 x i64> %val, <2 x i64>* %ptr, align 16 + ret void +} + +define void @test_store_v4i32(<4 x i32>* %ptr, <4 x i32> %val) #0 { +; CHECK: test_store_v4i32 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <4 x i32> %val, <4 x i32>* %ptr, align 16 + ret void +} + +define void @test_store_v8i16(<8 x i16>* %ptr, <8 x i16> %val) #0 { +; CHECK: test_store_v8i16 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <8 x i16> %val, <8 x i16>* %ptr, align 16 + ret void +} + +define void @test_store_v16i8(<16 x i8>* %ptr, <16 x i8> %val) #0 { +; CHECK: test_store_v16i8 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <16 x i8> %val, <16 x i8>* %ptr, align 16 + ret void +} + define fp128 @test_load_f128(fp128* readonly %ptr) #2 { ; CHECK: test_load_f128 ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}] Index: test/CodeGen/AArch64/neon-copy.ll =================================================================== --- test/CodeGen/AArch64/neon-copy.ll +++ test/CodeGen/AArch64/neon-copy.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s 
-verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) { @@ -859,7 +860,8 @@ define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) { ; CHECK-LABEL: test_dup_v1i64_v4i16: -; CHECK: dup v0.4h, v0.h[0] +; CHECK-LE: dup v0.4h, v0.h[0] +; CHECK-BE: dup v0.4h, v0.h[2] entry: %x = extractelement <1 x i64> %a, i32 0 %vget_lane = trunc i64 %x to i16 @@ -883,7 +885,8 @@ define <8 x i16> @test_dup_v2i64_v8i16(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v8i16: -; CHECK: dup v0.8h, v0.h[4] +; CHECK-LE: dup v0.8h, v0.h[4] +; CHECK-BE: dup v0.8h, v0.h[6] entry: %x = extractelement <2 x i64> %a, i32 1 %vget_lane = trunc i64 %x to i16 @@ -926,7 +929,8 @@ define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v4i16: -; CHECK: dup v0.4h, v0.h[0] +; CHECK-LE: dup v0.4h, v0.h[0] +; CHECK-BE: dup v0.4h, v0.h[2] entry: %x = extractelement <2 x i64> %a, i32 0 %vget_lane = trunc i64 %x to i16 Index: test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll =================================================================== --- test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll +++ test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll @@ -1,3 +1,4 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s define void @test_ldst1_v16i8(<16 x i8>* %ptr, <16 x i8>* %ptr2) { @@ -2,4 +3,4 @@ ; CHECK-LABEL: test_ldst1_v16i8: -; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.16b}, 
[x{{[0-9]+|sp}}] +; CHECK: ldr q{{[0-9]+}}, [{{x[0-9]+|sp}}] +; CHECK: str q{{[0-9]+}}, [{{x[0-9]+|sp}}] %tmp = load <16 x i8>* %ptr @@ -11,8 +12,8 @@ define void @test_ldst1_v8i16(<8 x i16>* %ptr, <8 x i16>* %ptr2) { ; CHECK-LABEL: test_ldst1_v8i16: -; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}] +; CHECK: ldr q{{[0-9]+}}, [{{x[0-9]+|sp}}] +; CHECK: str q{{[0-9]+}}, [{{x[0-9]+|sp}}] %tmp = load <8 x i16>* %ptr store <8 x i16> %tmp, <8 x i16>* %ptr2 ret void @@ -20,8 +21,8 @@ define void @test_ldst1_v4i32(<4 x i32>* %ptr, <4 x i32>* %ptr2) { ; CHECK-LABEL: test_ldst1_v4i32: -; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}] +; CHECK: ldr q{{[0-9]+}}, [{{x[0-9]+|sp}}] +; CHECK: str q{{[0-9]+}}, [{{x[0-9]+|sp}}] %tmp = load <4 x i32>* %ptr store <4 x i32> %tmp, <4 x i32>* %ptr2 ret void @@ -29,8 +30,8 @@ define void @test_ldst1_v2i64(<2 x i64>* %ptr, <2 x i64>* %ptr2) { ; CHECK-LABEL: test_ldst1_v2i64: -; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}] +; CHECK: ldr q{{[0-9]+}}, [{{x[0-9]+|sp}}] +; CHECK: str q{{[0-9]+}}, [{{x[0-9]+|sp}}] %tmp = load <2 x i64>* %ptr store <2 x i64> %tmp, <2 x i64>* %ptr2 ret void @@ -38,8 +39,8 @@ define void @test_ldst1_v8i8(<8 x i8>* %ptr, <8 x i8>* %ptr2) { ; CHECK-LABEL: test_ldst1_v8i8: -; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] +; CHECK: ldr d{{[0-9]+}}, [{{x[0-9]+|sp}}] +; CHECK: str d{{[0-9]+}}, [{{x[0-9]+|sp}}] %tmp = load <8 x i8>* %ptr store <8 x i8> %tmp, <8 x i8>* %ptr2 ret void @@ -47,8 +48,8 @@ define void @test_ldst1_v4i16(<4 x i16>* %ptr, <4 x i16>* %ptr2) { ; CHECK-LABEL: test_ldst1_v4i16: -; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] +; CHECK: ldr d{{[0-9]+}}, [{{x[0-9]+|sp}}] +; CHECK: str d{{[0-9]+}}, [{{x[0-9]+|sp}}] %tmp = load <4 x i16>* %ptr store <4 x i16> %tmp,
<4 x i16>* %ptr2 ret void @@ -56,8 +57,8 @@ define void @test_ldst1_v2i32(<2 x i32>* %ptr, <2 x i32>* %ptr2) { ; CHECK-LABEL: test_ldst1_v2i32: -; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] +; CHECK: ldr d{{[0-9]+}}, [{{x[0-9]+|sp}}] +; CHECK: str d{{[0-9]+}}, [{{x[0-9]+|sp}}] %tmp = load <2 x i32>* %ptr store <2 x i32> %tmp, <2 x i32>* %ptr2 ret void @@ -65,8 +66,8 @@ define void @test_ldst1_v1i64(<1 x i64>* %ptr, <1 x i64>* %ptr2) { ; CHECK-LABEL: test_ldst1_v1i64: -; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] -; CHECK: st1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}] +; CHECK: ldr d{{[0-9]+}}, [{{x[0-9]+|sp}}] +; CHECK: str d{{[0-9]+}}, [{{x[0-9]+|sp}}] %tmp = load <1 x i64>* %ptr store <1 x i64> %tmp, <1 x i64>* %ptr2 ret void Index: test/CodeGen/AArch64/neon-truncStore-extLoad.ll =================================================================== --- test/CodeGen/AArch64/neon-truncStore-extLoad.ll +++ test/CodeGen/AArch64/neon-truncStore-extLoad.ll @@ -1,3 +1,4 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s ; A vector TruncStore can not be selected.
@@ -5,7 +6,7 @@ define void @truncStore.v2i64(<2 x i64> %a, <2 x i32>* %result) { ; CHECK-LABEL: truncStore.v2i64: ; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d -; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}] +; CHECK: str d{{[0-9]+}}, [{{x[0-9]+|sp}}] %b = trunc <2 x i64> %a to <2 x i32> store <2 x i32> %b, <2 x i32>* %result ret void @@ -14,7 +15,7 @@ define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) { ; CHECK-LABEL: truncStore.v4i32: ; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s -; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}] +; CHECK: str d{{[0-9]+}}, [{{x[0-9]+|sp}}] %b = trunc <4 x i32> %a to <4 x i16> store <4 x i16> %b, <4 x i16>* %result ret void @@ -23,7 +24,7 @@ define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) { ; CHECK-LABEL: truncStore.v8i16: ; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h -; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}] +; CHECK: str d{{[0-9]+}}, [{{x[0-9]+|sp}}] %b = trunc <8 x i16> %a to <8 x i8> store <8 x i8> %b, <8 x i8>* %result ret void @@ -54,4 +55,4 @@ %vecext = extractelement <4 x i8> %a, i32 0 %conv = zext i8 %vecext to i32 ret i32 %conv -} \ No newline at end of file +}