Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2248,13 +2248,15 @@
 }
 
 let AddedComplexity = 19 in {
+  defm : VecROStoreLane0Pat;
   defm : VecROStoreLane0Pat;
-  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
   defm : VecROStoreLane0Pat;
-  defm : VecROStoreLane0Pat;
-  defm : VecROStoreLane0Pat;
-  defm : VecROStoreLane0Pat;
-  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
+  defm : VecROStoreLane0Pat;
 }
 
 //---
@@ -2288,8 +2290,16 @@
                      (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
 
-// Match all store 64 bits width whose type is compatible with FPR64
 let AddedComplexity = 10 in {
+
+// Match all store 64 bits width whose type is compatible with FPR64
+def : Pat<(store (v1i64 FPR64:$Rt),
+                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (v1f64 FPR64:$Rt),
+                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+
 let Predicates = [IsLE] in {
   // We must use ST1 to store vectors in big-endian.
   def : Pat<(store (v2f32 FPR64:$Rt),
@@ -2308,14 +2318,12 @@
                    (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
             (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
 }
-def : Pat<(store (v1f64 FPR64:$Rt),
-                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
-          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat<(store (v1i64 FPR64:$Rt),
-                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
-          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
 
 // Match all store 128 bits width whose type is compatible with FPR128
+def : Pat<(store (f128 FPR128:$Rt),
+                 (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+          (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+
 let Predicates = [IsLE] in {
   // We must use ST1 to store vectors in big-endian.
   def : Pat<(store (v4f32 FPR128:$Rt),
@@ -2340,9 +2348,6 @@
                     (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
             (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
 }
-def : Pat<(store (f128 FPR128:$Rt),
-                 (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
-          (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
 
 // truncstore i64
 def : Pat<(truncstorei32 GPR64:$Rt,
@@ -2356,6 +2361,29 @@
 
 } // AddedComplexity = 10
 
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecStoreLane0Pat {
+  def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), 0)),
+                     (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+            (STR (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+                 GPR64sp:$Rn, IndexType:$offset)>;
+}
+
+let AddedComplexity = 19 in {
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+  defm : VecStoreLane0Pat;
+}
+
 //---
 // (unscaled immediate)
 defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur",
@@ -2387,6 +2415,11 @@
                     (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
 
 // Match all store 64 bits width whose type is compatible with FPR64
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
 let Predicates = [IsLE] in {
   // We must use ST1 to store vectors in big-endian.
   def : Pat<(store (v2f32 FPR64:$Rt),
@@ -2405,12 +2438,11 @@
                    (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
             (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
 }
-def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
-          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
-def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
-          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
 
 // Match all store 128 bits width whose type is compatible with FPR128
+def : Pat<(store (f128 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+          (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
 let Predicates = [IsLE] in {
   // We must use ST1 to store vectors in big-endian.
   def : Pat<(store (v4f32 FPR128:$Rt),
@@ -4151,12 +4183,18 @@
           (SUBREG_TO_REG (i32 0),
                          (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 
+def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+          (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+          (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
 def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
           (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>;
 def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
           (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>;
+
 def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
           (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (i64 FPR64:$Rn), dsub))>;
@@ -4170,6 +4208,7 @@
           (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
 def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
           (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+
 def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
           (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
Index: llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -925,7 +925,7 @@
 ; CHECK-LABEL: test_extracts_inserts_varidx_insert:
 ; CHECK: and [[MASKED_IDX:x[0-9]+]], x0, #0x3
 ; CHECK: bfi x9, [[MASKED_IDX]], #1, #2
-; CHECK: st1 { v0.h }[0], [x9]
+; CHECK: str h0, [x9]
 ; CHECK-DAG: ldr d[[R:[0-9]+]]
 ; CHECK-DAG: mov v[[R]].h[1], v0.h[1]
 ; CHECK-DAG: mov v[[R]].h[2], v0.h[2]
Index: llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
@@ -391,6 +391,15 @@
   ret void
 }
 
+define void @test_vst1q_lane0_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vst1q_lane0_s16:
+; CHECK: str {{h[0-9]+}}, [x0]
+entry:
+  %0 = extractelement <8 x i16> %b, i32 0
+  store i16 %0, i16* %a, align 2
+  ret void
+}
+
 define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
 ; CHECK-LABEL: test_vst1q_lane_s32:
 ; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
@@ -400,6 +409,15 @@
   ret void
 }
 
+define void @test_vst1q_lane0_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vst1q_lane0_s32:
+; CHECK: str {{s[0-9]+}}, [x0]
+entry:
+  %0 = extractelement <4 x i32> %b, i32 0
+  store i32 %0, i32* %a, align 4
+  ret void
+}
+
 define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vst1q_lane_s64:
 ; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
@@ -409,6 +427,15 @@
   ret void
 }
 
+define void @test_vst1q_lane0_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vst1q_lane0_s64:
+; CHECK: str {{d[0-9]+}}, [x0]
+entry:
+  %0 = extractelement <2 x i64> %b, i32 0
+  store i64 %0, i64* %a, align 8
+  ret void
+}
+
 define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
 ; CHECK-LABEL: test_vst1q_lane_f32:
 ; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
@@ -418,6 +445,15 @@
   ret void
 }
 
+define void @test_vst1q_lane0_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vst1q_lane0_f32:
+; CHECK: str {{s[0-9]+}}, [x0]
+entry:
+  %0 = extractelement <4 x float> %b, i32 0
+  store float %0, float* %a, align 4
+  ret void
+}
+
 define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
 ; CHECK-LABEL: test_vst1q_lane_f64:
 ; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
@@ -427,6 +463,15 @@
   ret void
 }
 
+define void @test_vst1q_lane0_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vst1q_lane0_f64:
+; CHECK: str {{d[0-9]+}}, [x0]
+entry:
+  %0 = extractelement <2 x double> %b, i32 0
+  store double %0, double* %a, align 8
+  ret void
+}
+
 define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
 ; CHECK-LABEL: test_vst1_lane_s8:
 ; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
@@ -445,6 +490,15 @@
   ret void
 }
 
+define void @test_vst1_lane0_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vst1_lane0_s16:
+; CHECK: str {{h[0-9]+}}, [x0]
+entry:
+  %0 = extractelement <4 x i16> %b, i32 0
+  store i16 %0, i16* %a, align 2
+  ret void
+}
+
 define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vst1_lane_s32:
 ; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
@@ -454,9 +508,18 @@
   ret void
 }
 
+define void @test_vst1_lane0_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vst1_lane0_s32:
+; CHECK: str {{s[0-9]+}}, [x0]
+entry:
+  %0 = extractelement <2 x i32> %b, i32 0
+  store i32 %0, i32* %a, align 4
+  ret void
+}
+
 define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
 ; CHECK-LABEL: test_vst1_lane_s64:
-; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+; CHECK: str {{d[0-9]+}}, [x0]
 entry:
   %0 = extractelement <1 x i64> %b, i32 0
   store i64 %0, i64* %a, align 8
@@ -472,6 +535,15 @@
   ret void
 }
 
+define void @test_vst1_lane0_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vst1_lane0_f32:
+; CHECK: str {{s[0-9]+}}, [x0]
+entry:
+  %0 = extractelement <2 x float> %b, i32 0
+  store float %0, float* %a, align 4
+  ret void
+}
+
 define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
 ; CHECK-LABEL: test_vst1_lane_f64:
 ; CHECK: str {{d[0-9]+}}, [x0]
Index: llvm/test/CodeGen/AArch64/arm64-st1.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-st1.ll
+++ llvm/test/CodeGen/AArch64/arm64-st1.ll
@@ -38,6 +38,14 @@
   ret void
 }
 
+define void @st1lane0_8h(<8 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane0_8h
+; CHECK: str
+  %tmp = extractelement <8 x i16> %A, i32 0
+  store i16 %tmp, i16* %D
+  ret void
+}
+
 define void @st1lane_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane_ro_8h
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -65,6 +73,14 @@
   ret void
 }
 
+define void @st1lane0_4s(<4 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane0_4s
+; CHECK: str
+  %tmp = extractelement <4 x i32> %A, i32 0
+  store i32 %tmp, i32* %D
+  ret void
+}
+
 define void @st1lane_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane_ro_4s
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -92,6 +108,14 @@
   ret void
 }
 
+define void @st1lane0_4s_float(<4 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane0_4s_float
+; CHECK: str
+  %tmp = extractelement <4 x float> %A, i32 0
+  store float %tmp, float* %D
+  ret void
+}
+
 define void @st1lane_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane_ro_4s_float
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -119,6 +143,14 @@
   ret void
 }
 
+define void @st1lane0_2d(<2 x i64> %A, i64* %D) {
+; CHECK-LABEL: st1lane0_2d
+; CHECK: str
+  %tmp = extractelement <2 x i64> %A, i32 0
+  store i64 %tmp, i64* %D
+  ret void
+}
+
 define void @st1lane_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane_ro_2d
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -146,6 +178,14 @@
   ret void
 }
 
+define void @st1lane0_2d_double(<2 x double> %A, double* %D) {
+; CHECK-LABEL: st1lane0_2d_double
+; CHECK: str
+  %tmp = extractelement <2 x double> %A, i32 0
+  store double %tmp, double* %D
+  ret void
+}
+
 define void @st1lane_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane_ro_2d_double
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -201,6 +241,14 @@
   ret void
 }
 
+define void @st1lane0_4h(<4 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane0_4h
+; CHECK: str
+  %tmp = extractelement <4 x i16> %A, i32 0
+  store i16 %tmp, i16* %D
+  ret void
+}
+
 define void @st1lane_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane_ro_4h
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -228,6 +276,14 @@
   ret void
 }
 
+define void @st1lane0_2s(<2 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane0_2s
+; CHECK: str
+  %tmp = extractelement <2 x i32> %A, i32 0
+  store i32 %tmp, i32* %D
+  ret void
+}
+
 define void @st1lane_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane_ro_2s
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -255,6 +311,14 @@
   ret void
 }
 
+define void @st1lane0_2s_float(<2 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane0_2s_float
+; CHECK: str
+  %tmp = extractelement <2 x float> %A, i32 0
+  store float %tmp, float* %D
+  ret void
+}
+
 define void @st1lane_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) {
 ; CHECK-LABEL: st1lane_ro_2s_float
 ; CHECK: add x[[XREG:[0-9]+]], x0, x1
Index: llvm/test/CodeGen/AArch64/fp16-vector-load-store.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fp16-vector-load-store.ll
+++ llvm/test/CodeGen/AArch64/fp16-vector-load-store.ll
@@ -88,6 +88,15 @@
   ret void
 }
 
+define void @store_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
+; CHECK-LABEL: store_lane0_64:
+; CHECK: str h0, [x0]
+entry:
+  %0 = extractelement <4 x half> %b, i32 0
+  store half %0, half* %a, align 2
+  ret void
+}
+
 ; Store from one lane of v8f16
 define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
 ; CHECK-LABEL: store_lane_128:
@@ -98,6 +107,15 @@
   ret void
 }
 
+define void @store_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
+; CHECK-LABEL: store_lane0_128:
+; CHECK: str h0, [x0]
+entry:
+  %0 = extractelement <8 x half> %b, i32 0
+  store half %0, half* %a, align 2
+  ret void
+}
+
 ; NEON intrinsics - (de-)interleaving loads and stores
 declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
 declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
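
For context, a minimal source-level sketch of the codegen behaviour the new tests check (not part of the patch; the function name and the use of the ACLE intrinsic here are illustrative assumptions): storing lane 0 of a vector should now select a plain scalar str of the matching sub-register instead of st1 of a single lane.

```c
/* Illustrative only, not from the patch. Compile for an AArch64 target.
 * With the new lane-0 patterns, the store below is expected to lower to
 *   str s0, [x0]
 * rather than
 *   st1 { v0.s }[0], [x0]
 */
#include <arm_neon.h>

void store_lane0(float *p, float32x4_t v) {
  vst1q_lane_f32(p, v, 0);  /* lane 0 store of a 128-bit vector */
}
```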