Index: lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- lib/Target/AArch64/AArch64InstrFormats.td +++ lib/Target/AArch64/AArch64InstrFormats.td @@ -733,6 +733,13 @@ let PrintMethod = "printVectorIndex"; let MIOperandInfo = (ops i64imm); } +def VectorIndexBNon0 : Operand, ImmLeaf 0); +}]> { + let ParserMatchClass = VectorIndexBOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} def VectorIndexH : Operand, ImmLeaf { @@ -740,6 +747,13 @@ let PrintMethod = "printVectorIndex"; let MIOperandInfo = (ops i64imm); } +def VectorIndexHNon0 : Operand, ImmLeaf 0); +}]> { + let ParserMatchClass = VectorIndexHOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} def VectorIndexS : Operand, ImmLeaf { @@ -747,6 +761,13 @@ let PrintMethod = "printVectorIndex"; let MIOperandInfo = (ops i64imm); } +def VectorIndexSNon0 : Operand, ImmLeaf 0); +}]> { + let ParserMatchClass = VectorIndexSOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} def VectorIndexD : Operand, ImmLeaf { @@ -754,6 +775,13 @@ let PrintMethod = "printVectorIndex"; let MIOperandInfo = (ops i64imm); } +def VectorIndexDNon0 : Operand, ImmLeaf 0); +}]> { + let ParserMatchClass = VectorIndexDOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} // 8-bit immediate for AdvSIMD where 64-bit values of the form: // aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh Index: lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.td +++ lib/Target/AArch64/AArch64InstrInfo.td @@ -1867,6 +1867,32 @@ } } // AddedComplexity = 10 +// Match stores from lane 0 to the right subreg's store. +multiclass VecROStoreLane0Pat { + + def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), + GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), + GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 15 in { + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; + defm : VecROStoreLane0Pat; +} + //--- // (unsigned immediate) defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", @@ -1925,6 +1951,12 @@ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; +let AddedComplexity = 15 in +def : Pat<(store (i64 (vector_extract (v2i64 VecListOne128:$Vt), 0)), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui (EXTRACT_SUBREG VecListOne128:$Vt, dsub), + GPR64sp:$Rn, uimm12s8:$offset)>; + // Match all store 128 bits width whose type is compatible with FPR128 let Predicates = [IsLE] in { // We must use ST1 to store vectors in big-endian. @@ -4768,7 +4800,7 @@ defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; -let AddedComplexity = 15 in +let AddedComplexity = 19 in class St1Lane128Pat : Pat<(scalar_store @@ -4777,14 +4809,14 @@ (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>; def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; -let AddedComplexity = 15 in +let AddedComplexity = 19 in class St1Lane64Pat : Pat<(scalar_store @@ -4794,10 +4826,10 @@ VecIndex:$idx, GPR64sp:$Rn)>; def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; multiclass St1LanePost64Pat %b) { ; CHECK-LABEL: test_vst1_lane_s64: -; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0] +; CHECK: str {{d[0-9]+}}, [x0] entry: %0 = extractelement <1 x i64> %b, i32 0 store i64 %0, i64* %a, align 8 Index: test/CodeGen/AArch64/arm64-st1.ll =================================================================== --- test/CodeGen/AArch64/arm64-st1.ll +++ test/CodeGen/AArch64/arm64-st1.ll @@ -8,6 +8,26 @@ ret void } +define void @st1lane_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_16b +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.b { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i8* %D, i64 %offset + %tmp = extractelement <16 x i8> %A, i32 1 + store i8 %tmp, i8* %ptr + ret void +} + +define void @st1lane0_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_16b +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.b { v0 }[0], [x[[XREG]]] + %ptr = getelementptr i8* %D, i64 %offset + %tmp = extractelement <16 x i8> %A, i32 0 + store i8 %tmp, i8* %ptr + ret void +} + define void @st1lane_8h(<8 x i16> %A, i16* %D) { ; CHECK-LABEL: st1lane_8h ; CHECK: st1.h @@ -16,6 +36,25 @@ ret void } +define void @st1lane_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_8h +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.h { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i16* %D, i64 %offset + %tmp = extractelement <8 x i16> %A, i32 1 + store i16 %tmp, i16* %ptr + ret void +} + +define void @st1lane0_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_8h +; CHECK: str h0, [x0, x1, lsl #1] + %ptr = getelementptr i16* %D, i64 %offset + %tmp = extractelement <8 x i16> %A, i32 0 + store i16 %tmp, i16* %ptr + ret void +} + define void @st1lane_4s(<4 x i32> %A, i32* %D) { ; CHECK-LABEL: st1lane_4s ; CHECK: st1.s @@ -24,6 +63,25 @@ ret void } +define void @st1lane_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_4s +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.s { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i32* %D, i64 %offset + %tmp = extractelement <4 x i32> %A, i32 1 + store i32 %tmp, i32* %ptr + ret void +} + +define void @st1lane0_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_4s +; CHECK: str s0, [x0, x1, lsl #2] + %ptr = getelementptr i32* %D, i64 %offset + %tmp = extractelement <4 x i32> %A, i32 0 + store i32 %tmp, i32* %ptr + ret void +} + define void @st1lane_4s_float(<4 x float> %A, float* %D) { ; CHECK-LABEL: st1lane_4s_float ; CHECK: st1.s @@ -32,6 +90,25 @@ ret void } +define void @st1lane_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_4s_float +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.s { v0 }[1], [x[[XREG]]] + %ptr = getelementptr float* %D, i64 %offset + %tmp = extractelement <4 x float> %A, i32 1 + store float %tmp, float* %ptr + ret void +} + +define void @st1lane0_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_4s_float +; CHECK: str s0, [x0, x1, lsl #2] + %ptr = getelementptr float* %D, i64 %offset + %tmp = extractelement <4 x float> %A, i32 0 + store float %tmp, float* %ptr + ret void +} + define void @st1lane_2d(<2 x i64> %A, i64* %D) { ; CHECK-LABEL: st1lane_2d ; CHECK: st1.d @@ -40,6 +117,25 @@ ret void } +define void @st1lane_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_2d +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.d { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i64* %D, i64 %offset + %tmp = extractelement <2 x i64> %A, i32 1 + store i64 %tmp, i64* %ptr + ret void +} + +define void @st1lane0_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_2d +; CHECK: str d0, [x0, x1, lsl #3] + %ptr = getelementptr i64* %D, i64 %offset + %tmp = extractelement <2 x i64> %A, i32 0 + store i64 %tmp, i64* %ptr + ret void +} + define void @st1lane_2d_double(<2 x double> %A, double* %D) { ; CHECK-LABEL: st1lane_2d_double ; CHECK: st1.d @@ -48,6 +144,25 @@ ret void } +define void @st1lane_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_2d_double +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.d { v0 }[1], [x[[XREG]]] + %ptr = getelementptr double* %D, i64 %offset + %tmp = extractelement <2 x double> %A, i32 1 + store double %tmp, double* %ptr + ret void +} + +define void @st1lane0_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_2d_double +; CHECK: str d0, [x0, x1, lsl #3] + %ptr = getelementptr double* %D, i64 %offset + %tmp = extractelement <2 x double> %A, i32 0 + store double %tmp, double* %ptr + ret void +} + define void @st1lane_8b(<8 x i8> %A, i8* %D) { ; CHECK-LABEL: st1lane_8b ; CHECK: st1.b @@ -56,6 +171,26 @@ ret void } +define void @st1lane_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_8b +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.b { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i8* %D, i64 %offset + %tmp = extractelement <8 x i8> %A, i32 1 + store i8 %tmp, i8* %ptr + ret void +} + +define void @st1lane0_ro_8b(<8 x i8> %A, i8* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_8b +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.b { v0 }[0], [x[[XREG]]] + %ptr = getelementptr i8* %D, i64 %offset + %tmp = extractelement <8 x i8> %A, i32 0 + store i8 %tmp, i8* %ptr + ret void +} + define void @st1lane_4h(<4 x i16> %A, i16* %D) { ; CHECK-LABEL: st1lane_4h ; CHECK: st1.h @@ -64,6 +199,25 @@ ret void } +define void @st1lane_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_4h +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.h { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i16* %D, i64 %offset + %tmp = extractelement <4 x i16> %A, i32 1 + store i16 %tmp, i16* %ptr + ret void +} + +define void @st1lane0_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_4h +; CHECK: str h0, [x0, x1, lsl #1] + %ptr = getelementptr i16* %D, i64 %offset + %tmp = extractelement <4 x i16> %A, i32 0 + store i16 %tmp, i16* %ptr + ret void +} + define void @st1lane_2s(<2 x i32> %A, i32* %D) { ; CHECK-LABEL: st1lane_2s ; CHECK: st1.s @@ -72,6 +226,25 @@ ret void } +define void @st1lane_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_2s +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.s { v0 }[1], [x[[XREG]]] + %ptr = getelementptr i32* %D, i64 %offset + %tmp = extractelement <2 x i32> %A, i32 1 + store i32 %tmp, i32* %ptr + ret void +} + +define void @st1lane0_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_2s +; CHECK: str s0, [x0, x1, lsl #2] + %ptr = getelementptr i32* %D, i64 %offset + %tmp = extractelement <2 x i32> %A, i32 0 + store i32 %tmp, i32* %ptr + ret void +} + define void @st1lane_2s_float(<2 x float> %A, float* %D) { ; CHECK-LABEL: st1lane_2s_float ; CHECK: st1.s @@ -80,6 +253,25 @@ ret void } +define void @st1lane_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) { +; CHECK-LABEL: st1lane_ro_2s_float +; CHECK: add x[[XREG:[0-9]+]], x0, x1 +; CHECK: st1.s { v0 }[1], [x[[XREG]]] + %ptr = getelementptr float* %D, i64 %offset + %tmp = extractelement <2 x float> %A, i32 1 + store float %tmp, float* %ptr + ret void +} + +define void @st1lane0_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) { +; CHECK-LABEL: st1lane0_ro_2s_float +; CHECK: str s0, [x0, x1, lsl #2] + %ptr = getelementptr float* %D, i64 %offset + %tmp = extractelement <2 x float> %A, i32 0 + store float %tmp, float* %ptr + ret void +} + define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) { ; CHECK-LABEL: st2lane_16b ; CHECK: st2.b