diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -486,3 +486,675 @@
   %1 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
   ret half %1
 }
+
+define dso_local <4 x half> @t_vfma_lane_f16_lo(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfma_lane_f16_lo:
+; CHECK:       .Lt_vfma_lane_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmla v17.4h, v16.4h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+entry:
+  %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %fmla3 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %lane1, <4 x half> %a)
+  ret <4 x half> %fmla3
+}
+
+define dso_local <8 x half> @t_vfmaq_lane_f16_lo(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmaq_lane_f16_lo:
+; CHECK:       .Lt_vfmaq_lane_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmla v17.8h, v16.8h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+entry:
+  %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %fmla3 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %lane1, <8 x half> %a)
+  ret <8 x half> %fmla3
+}
+
+define dso_local <4 x half> @t_vfma_laneq_f16_lo(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfma_laneq_f16_lo:
+; CHECK:       .Lt_vfma_laneq_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmla v17.4h, v16.4h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+entry:
+  %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %lane1, <4 x half> %b, <4 x half> %a)
+  ret <4 x half> %0
+}
+
+define dso_local <8 x half> @t_vfmaq_laneq_f16_lo(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmaq_laneq_f16_lo:
+; CHECK:       .Lt_vfmaq_laneq_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmla v17.8h, v16.8h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+entry:
+  %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %lane1, <8 x half> %b, <8 x half> %a)
+  ret <8 x half> %0
+}
+
+define dso_local <4 x half> @t_vfma_n_f16_lo(<4 x half> %a, <4 x half> %b, half %c) {
+; CHECK-LABEL: t_vfma_n_f16_lo:
+; CHECK:       .Lt_vfma_n_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmla v17.4h, v16.4h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+entry:
+  %vecinit = insertelement <4 x half> undef, half %c, i32 0
+  %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %vecinit3, <4 x half> %a) #4
+  ret <4 x half> %0
+}
+
+define dso_local <8 x half> @t_vfmaq_n_f16_lo(<8 x half> %a, <8 x half> %b, half %c) {
+; CHECK-LABEL: t_vfmaq_n_f16_lo:
+; CHECK:       .Lt_vfmaq_n_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmla v17.8h, v16.8h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+entry:
+  %vecinit = insertelement <8 x half> undef, half %c, i32 0
+  %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %vecinit7, <8 x half> %a) #4
+  ret <8 x half> %0
+}
+
+define dso_local half @t_vfmah_lane_f16_lo(half %a, half %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmah_lane_f16_lo:
+; CHECK:       .Lt_vfmah_lane_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmla h17, h16, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <4 x half> %c, i32 0
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
+  ret half %0
+}
+
+define dso_local half @t_vfmah_laneq_f16_lo(half %a, half %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmah_laneq_f16_lo:
+; CHECK:       .Lt_vfmah_laneq_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmla h17, h16, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <8 x half> %c, i32 0
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
+  ret half %0
+}
+
+define dso_local <4 x half> @t_vfms_lane_f16_lo(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfms_lane_f16_lo:
+; CHECK:       .Lt_vfms_lane_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmls v17.4h, v16.4h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+
+entry:
+  %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
+  %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %fmla3 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub, <4 x half> %lane1, <4 x half> %a)
+  ret <4 x half> %fmla3
+}
+
+define dso_local <8 x half> @t_vfmsq_lane_f16_lo(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsq_lane_f16_lo:
+; CHECK:       .Lt_vfmsq_lane_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmls v17.8h, v16.8h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+
+entry:
+  %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
+  %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %fmla3 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub, <8 x half> %lane1, <8 x half> %a)
+  ret <8 x half> %fmla3
+}
+
+define dso_local <4 x half> @t_vfms_laneq_f16_lo(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfms_laneq_f16_lo:
+; CHECK:       .Lt_vfms_laneq_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmls v17.4h, v16.4h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+entry:
+  %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
+  %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %lane1, <4 x half> %sub, <4 x half> %a)
+  ret <4 x half> %0
+}
+
+define dso_local <8 x half> @t_vfmsq_laneq_f16_lo(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsq_laneq_f16_lo:
+; CHECK:       .Lt_vfmsq_laneq_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmls v17.8h, v16.8h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+entry:
+  %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
+  %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %lane1, <8 x half> %sub, <8 x half> %a)
+  ret <8 x half> %0
+}
+
+define dso_local <4 x half> @t_vfms_n_f16_lo(<4 x half> %a, <4 x half> %b, half %c) {
+; CHECK-LABEL: t_vfms_n_f16_lo:
+; CHECK:       .Lt_vfms_n_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmls v17.4h, v16.4h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+
+entry:
+  %sub = fsub <4 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
+  %vecinit = insertelement <4 x half> undef, half %c, i32 0
+  %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> undef, <4 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %0 = tail call <4 x half> @llvm.fma.v4f16(<4 x half> %sub, <4 x half> %vecinit3, <4 x half> %a) #4
+  ret <4 x half> %0
+}
+
+define dso_local <8 x half> @t_vfmsq_n_f16_lo(<8 x half> %a, <8 x half> %b, half %c) {
+; CHECK-LABEL: t_vfmsq_n_f16_lo:
+; CHECK:       .Lt_vfmsq_n_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmls v17.8h, v16.8h, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+
+
+entry:
+  %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
+  %vecinit = insertelement <8 x half> undef, half %c, i32 0
+  %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> undef, <8 x i32> zeroinitializer
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %sub, <8 x half> %vecinit7, <8 x half> %a) #4
+  ret <8 x half> %0
+}
+
+define dso_local half @t_vfmsh_lane_f16_lo(half %a, half %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsh_lane_f16_lo:
+; CHECK:       .Lt_vfmsh_lane_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmls h17, h16, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+entry:
+  %0 = fsub half 0xH8000, %b
+  %extract = extractelement <4 x half> %c, i32 0
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
+  ret half %1
+}
+
+define dso_local half @t_vfmsh_laneq_f16_lo(half %a, half %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsh_laneq_f16_lo:
+; CHECK:       .Lt_vfmsh_laneq_f16_lo$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80 // =80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    str q2, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v16.16b, v1.16b
+; CHECK-NEXT:    mov v17.16b, v0.16b
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    fmls h17, h16, v0.h[0]
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ret
+entry:
+  %0 = fsub half 0xH8000, %b
+  %extract = extractelement <8 x half> %c, i32 0
+  tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"()
+  %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
+  ret half %1
+}