Index: lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.td +++ lib/Target/AArch64/AArch64InstrInfo.td @@ -21,6 +21,9 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto","crypto">; +def IsLE : Predicate<"Subtarget->isLittle()">; +def IsBE : Predicate<"!Subtarget->isLittle()">; + // Use fused MAC if more precision in FP computation is allowed. def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" " FPOpFusion::Fast)">; @@ -4849,6 +4852,83 @@ : ls_neutral_pats, ls_atomic_pats; + +// wrappers to instantiate all allowed same-size fp/vector loads + +// NEON-BE: allow all neon vectors as well, since ld1/st1 must be disabled +// LD1 & ST1 are not ABI conforming in big endian: wrong arg memory layout +// TODO: eventually also enable for LE +// (desired by ARM - smaller code due to more powerful addressing modes) + +// neon 8 bit types +multiclass ls_FPR8_pats { + let Predicates = [HasNEON, IsBE] in { + defm : ls_neutral_pats; + } +} + +// neon 16 bit types +multiclass ls_FPR16_pats { + // float is always there + // shouldn't this be guarded by HasFPARMv8 ??? + defm : ls_neutral_pats; + + // TODO: eventually also enable for LE + let Predicates = [HasNEON, IsBE] in { + defm : ls_neutral_pats; + } +} + +// neon 32 bit types +multiclass ls_FPR32_pats { + // float is always there + defm : ls_neutral_pats; + + // TODO: eventually also enable for LE + let Predicates = [HasNEON, IsBE] in { + defm : ls_neutral_pats; +// defm : ls_neutral_pats; does not exist - v1f64 DOES -- WHY ? + } +} + +// neon 64 bit types +multiclass ls_FPR64_pats { + // float is always there + // shouldn't this be guarded by HasFPARMv8 ???
+ defm : ls_neutral_pats; + + // TODO: eventually also enable for LE + let Predicates = [HasNEON, IsBE] in { + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + } +} + +// neon 128 bit types FPR128 +multiclass ls_FPR128_pats { + // float is always there + defm : ls_neutral_pats; + + // TODO: eventually also enable for LE + let Predicates = [HasNEON, IsBE] in { + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + defm : ls_neutral_pats; + } +} + //===------------------------------ // 2.2. Addressing-mode instantiations //===------------------------------ @@ -4892,37 +4972,40 @@ !subst(ALIGN, min_align8, decls.pattern))), i64>; - defm : ls_neutral_pats; + + defm : ls_FPR16_pats< LSFP16_LDR, LSFP16_STR, Base, + !foreach(decls.pattern, Offset, !subst(OFFSET, hword_uimm12, decls.pattern)), !foreach(decls.pattern, address, !subst(OFFSET, hword_uimm12, - !subst(ALIGN, min_align2, decls.pattern))), - f16>; + !subst(ALIGN, min_align2, decls.pattern)))>; - defm : ls_neutral_pats; + !subst(ALIGN, min_align4, decls.pattern)))>; - defm : ls_neutral_pats; + !subst(ALIGN, min_align8, decls.pattern)))>; - defm : ls_neutral_pats; + !subst(ALIGN, min_align16, decls.pattern)))>; defm : load_signed_pats<"B", "", Base, !foreach(decls.pattern, Offset, @@ -4992,11 +5075,10 @@ defm : ls_int_neutral_pats; defm : ls_int_neutral_pats; - defm : ls_neutral_pats; - defm : ls_neutral_pats; - defm : ls_neutral_pats; - defm : ls_neutral_pats; + defm : ls_FPR16_pats; + defm : ls_FPR32_pats; + defm : ls_FPR64_pats; + defm : ls_FPR128_pats; def : Pat<(i64 (zextloadi32 address)), (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>; Index: lib/Target/AArch64/AArch64InstrNEON.td =================================================================== --- lib/Target/AArch64/AArch64InstrNEON.td +++ 
lib/Target/AArch64/AArch64InstrNEON.td @@ -3379,15 +3379,27 @@ } // Load multiple N-element structure to N consecutive registers (N = 1,2,3,4) -defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">; + +// LD1 disallowed in BE, when LDR and STR are used exclusively to save on 12bit offset adds. +// reason: LDR/STR use different memory/register layout (no element swaps). +// If different types of loads were used from the same memory address the results +// will be inconsistent. +// the only allowed use of LD1 is in initializations using explicit intrinsics to do +// the element-swaps. + +// this should work in BE - single operand: no element swaps +let Predicates = [IsLE] in { def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">; -defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">; + defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">; -defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">; + defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">; -defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">; + defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">; + defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">; +} + // Load multiple 1-element structure to N consecutive registers (N = 2,3,4) defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">; def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">; @@ -3433,72 +3445,77 @@ } // Store multiple N-element structures from N registers (N = 1,2,3,4) -defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">; +// ARM ABI: default memory layout in BE is LDR/STR +// (LD1/ST1 swap elements and are incompatible with ABI) +let Predicates = [IsLE] in { def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">; -defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">; + defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">; -defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">; + defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">; -defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">; + defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">; + defm ST4 : STVList_BHSD<0b0000, 
"VQuad", "st4">; + // Store multiple 1-element structures from N consecutive registers (N = 2,3,4) -defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">; -def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; + defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">; + def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; -defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">; -def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; + defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">; + def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; -defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">; -def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; + defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">; + def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; -def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; -def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; + def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; + def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; -def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; -def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; + def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; + def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; -def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>; -def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>; + def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>; + def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>; -def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; -def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; + def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; + def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; -def : Pat<(v2f32 (load GPR64xsp:$addr)), 
(LD1_2S GPR64xsp:$addr)>; -def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; + def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; + def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; -def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; -def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; + def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; + def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; -def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), - (ST1_2D GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), - (ST1_2D GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), - (ST1_4S GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), - (ST1_4S GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), - (ST1_8H GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), - (ST1_16B GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), + (ST1_8H GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), + (ST1_16B GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), - (ST1_1D GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), - (ST1_1D GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D 
GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), - (ST1_2S GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), - (ST1_2S GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), - (ST1_4H GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), - (ST1_8B GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), + (ST1_4H GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), + (ST1_8B GPR64xsp:$addr, VPR64:$value)>; +} // Match load/store of v1i8/v1i16/v1i32 type to FPR8/FPR16/FPR32 load/store. 
// FIXME: for now we have v1i8, v1i16, v1i32 legal types, if they are illegal, @@ -3681,34 +3698,36 @@ ImmTy2, asmop>; } -// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) -defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">; +let Predicates = [IsLE] in { defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, - "ld1">; + "ld1">; + // Post-index load multiple N-element structures from N registers (N = 1,2,3,4) + defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">; -defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">; + defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">; -defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, - "ld3">; + defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, + "ld3">; -defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">; + defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">; -// Post-index load multiple 1-element structures from N consecutive registers -// (N = 2,3,4) -defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, - "ld1">; -defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand, - uimm_exact16, "ld1">; + // Post-index load multiple 1-element structures from N consecutive registers + // (N = 2,3,4) + defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, + "ld1">; + defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand, + uimm_exact16, "ld1">; -defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, - "ld1">; -defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand, - uimm_exact24, "ld1">; + defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, + "ld1">; + defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand, + 
uimm_exact24, "ld1">; -defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, - "ld1">; -defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand, - uimm_exact32, "ld1">; + defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, + "ld1">; + defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand, + uimm_exact32, "ld1">; +} multiclass NeonI_STWB_VList opcode, bits<2> size, RegisterOperand VecList, Operand ImmTy, @@ -3764,33 +3783,35 @@ } // Post-index load multiple N-element structures from N registers (N = 1,2,3,4) -defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">; +let Predicates = [IsLE] in { defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, - "st1">; + "st1">; + defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">; -defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">; + defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">; -defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, - "st3">; + defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, + "st3">; -defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">; + defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">; -// Post-index load multiple 1-element structures from N consecutive registers -// (N = 2,3,4) -defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, - "st1">; -defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand, - uimm_exact16, "st1">; + // Post-index store multiple 1-element structures from N consecutive registers + // (N = 2,3,4) + defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, + "st1">; + defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand, + uimm_exact16, "st1">; -defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple",
uimm_exact24, uimm_exact48, - "st1">; -defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand, - uimm_exact24, "st1">; + defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, + "st1">; + defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand, + uimm_exact24, "st1">; -defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, - "st1">; -defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand, - uimm_exact32, "st1">; + defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, + "st1">; + defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand, + uimm_exact32, "st1">; +} // End of post-index vector load/store multiple N-element structure // (class SIMD lselem-post) @@ -3864,14 +3885,16 @@ !cast(List # "2D_operand"), asmop>; } +let Predicates = [IsLE] in { // Load single 1-element structure to all lanes of 1 register defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">; -// Load single N-element structure to all lanes of N consecutive -// registers (N = 2,3,4) -defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">; -defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">; -defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">; + // Load single N-element structure to all lanes of N consecutive + // registers (N = 2,3,4) + defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">; + defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">; + defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">; +} class LD1R_pattern ; // Match all LD1R instructions + +let Predicates = [IsLE] in { +// element swap on bytes == byte swap def : LD1R_pattern; - def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; -def : LD1R_pattern; + def : 
LD1R_pattern; + def : LD1R_pattern; +} class LD1R_pattern_v1 : Pat<(VTy (scalar_to_vector (DTy (LoadOp GPR64xsp:$Rn)))), (VTy (INST GPR64xsp:$Rn))>; -def : LD1R_pattern_v1; -def : LD1R_pattern_v1; +let Predicates = [IsLE] in { + def : LD1R_pattern_v1; + def : LD1R_pattern_v1; +} + multiclass VectorList_Bare_BHSD { defm B : VectorList_operands; @@ -3964,14 +3993,16 @@ } } -// Load single 1-element structure to one lane of 1 register. -defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">; +let Predicates = [IsLE] in { + // Load single 1-element structure to one lane of 1 register. + defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">; -// Load single N-element structure to one lane of N consecutive registers -// (N = 2,3,4) -defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; -defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; -defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; + // Load single N-element structure to one lane of N consecutive registers + // (N = 2,3,4) + defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; + defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; + defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; +} multiclass LD1LN_patterns; } -// Match all LD1LN instructions -defm : LD1LN_patterns; +let Predicates = [IsLE] in { + // Match all LD1LN instructions + defm : LD1LN_patterns; -defm : LD1LN_patterns; + defm : LD1LN_patterns; -defm : LD1LN_patterns; -defm : LD1LN_patterns; + defm : LD1LN_patterns; + defm : LD1LN_patterns; -defm : LD1LN_patterns; -defm : LD1LN_patterns; + defm : LD1LN_patterns; + defm : LD1LN_patterns; +} class NeonI_STN_Lane op2_1, bit op0, RegisterOperand VList, Operand ImmOp, string asmop> @@ -4048,14 +4081,16 @@ } } +let Predicates = [IsLE] in { // Store single 1-element structure from one lane of 1 register. 
defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">; -// Store single N-element structure from one lane of N consecutive registers -// (N = 2,3,4) -defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; -defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; -defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; + // Store single N-element structure from one lane of N consecutive registers + // (N = 2,3,4) + defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; + defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; + defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; +} multiclass ST1LN_patterns; } -// Match all ST1LN instructions -defm : ST1LN_patterns; +let Predicates = [IsLE] in { + // Match all ST1LN instructions + defm : ST1LN_patterns; -defm : ST1LN_patterns; + defm : ST1LN_patterns; -defm : ST1LN_patterns; -defm : ST1LN_patterns; + defm : ST1LN_patterns; + defm : ST1LN_patterns; -defm : ST1LN_patterns; -defm : ST1LN_patterns; - + defm : ST1LN_patterns; + defm : ST1LN_patterns; +} // End of vector load/store single N-element structure (class SIMD lsone). 
@@ -4153,18 +4189,20 @@ uimm_d, asmop>; } +let Predicates = [IsLE] in { // Post-index load single 1-element structure to all lanes of 1 register defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1, uimm_exact2, uimm_exact4, uimm_exact8>; -// Post-index load single N-element structure to all lanes of N consecutive -// registers (N = 2,3,4) -defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; + // Post-index load single N-element structure to all lanes of N consecutive + // registers (N = 2,3,4) + defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; + defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; + defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; +} let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb, $Rt = $src", @@ -4253,18 +4291,20 @@ } // Post-index load single 1-element structure to one lane of 1 register. 
+let Predicates = [IsLE] in { defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1, uimm_exact2, uimm_exact4, uimm_exact8>; -// Post-index load single N-element structure to one lane of N consecutive -// registers -// (N = 2,3,4) -defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; + // Post-index load single N-element structure to one lane of N consecutive + // registers + // (N = 2,3,4) + defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; + defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; + defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; +} let mayStore = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb", @@ -4353,17 +4393,19 @@ } // Post-index store single 1-element structure from one lane of 1 register. 
+let Predicates = [IsLE] in { defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1, uimm_exact2, uimm_exact4, uimm_exact8>; // Post-index store single N-element structure from one lane of N consecutive // registers (N = 2,3,4) -defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; + defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; + defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; + defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; +} // End of post-index load/store single N-element instructions // (class SIMD lsone-post)