diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -756,6 +756,12 @@ (COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>; def : FP16Pat<(fp_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), + (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTBSH SPR:$src2), + (SSubReg_f16_reg imm:$lane)))>; +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), + (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTBSH SPR:$src2), + (SSubReg_f16_reg imm:$lane)))>; def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", @@ -772,10 +778,17 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>, + [/* Intentionally left blank, see patterns below */]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), + (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH SPR:$src2), + (SSubReg_f16_reg imm:$lane)))>; +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), + (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTTSH SPR:$src2), + (SSubReg_f16_reg imm:$lane)))>; + def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs DPR:$Dd), (ins SPR:$Sm), NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm", diff --git a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll --- a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll +++ b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll @@ -77,19 +77,15 @@ define <4 x half> @test_vset_lane_f16(<4 x half> %a, float %fb) nounwind { ; CHECKHARD-LABEL: test_vset_lane_f16: ; CHECKHARD: @ %bb.0: @ %entry -; CHECKHARD-NEXT: vcvtb.f16.f32 s2, s2 -; CHECKHARD-NEXT: vmov r0, s2 -; CHECKHARD-NEXT: vmov.16 d0[3], r0 +; CHECKHARD-NEXT: vcvtt.f16.f32 s1, s2 ; CHECKHARD-NEXT: bx lr ; ; CHECKSOFT-LABEL: test_vset_lane_f16: ; CHECKSOFT: @ %bb.0: @ %entry -; CHECKSOFT-NEXT: vmov s0, r2 -; CHECKSOFT-NEXT: vcvtb.f16.f32 s0, s0 -; CHECKSOFT-NEXT: vmov d16, r0, r1 -; CHECKSOFT-NEXT: vmov r2, s0 -; CHECKSOFT-NEXT: vmov.16 d16[3], r2 -; CHECKSOFT-NEXT: vmov r0, r1, d16 +; CHECKSOFT-NEXT: vmov d0, r0, r1 +; CHECKSOFT-NEXT: vmov s2, r2 +; CHECKSOFT-NEXT: vcvtt.f16.f32 s1, s2 +; CHECKSOFT-NEXT: vmov r0, r1, d0 ; CHECKSOFT-NEXT: bx lr entry: %b = fptrunc float %fb to half @@ -100,21 +96,17 @@ define <8 x half> @test_vset_laneq_f16_1(<8 x half> %a, float %fb) nounwind { ; CHECKHARD-LABEL: test_vset_laneq_f16_1: ; CHECKHARD: @ %bb.0: @ %entry -; CHECKHARD-NEXT: vcvtb.f16.f32 s4, s4 -; CHECKHARD-NEXT: vmov r0, s4 -; CHECKHARD-NEXT: vmov.16 d0[1], r0 +; CHECKHARD-NEXT: vcvtt.f16.f32 s0, s4 ; CHECKHARD-NEXT: bx lr ; ; CHECKSOFT-LABEL: test_vset_laneq_f16_1: ; CHECKSOFT: @ %bb.0: @ %entry -; CHECKSOFT-NEXT: vldr s0, [sp] -; CHECKSOFT-NEXT: vmov d17, r2, r3 -; CHECKSOFT-NEXT: vmov d16, r0, r1 -; CHECKSOFT-NEXT: vcvtb.f16.f32 s0, s0 -; CHECKSOFT-NEXT: vmov r12, s0 -; CHECKSOFT-NEXT: vmov.16 d16[1], r12 -; CHECKSOFT-NEXT: vmov r2, r3, d17 -; CHECKSOFT-NEXT: vmov r0, r1, d16 +; CHECKSOFT-NEXT: vmov d1, r2, r3 +; CHECKSOFT-NEXT: vldr s4, [sp] +; CHECKSOFT-NEXT: vmov d0, r0, r1 +; CHECKSOFT-NEXT: vcvtt.f16.f32 s0, s4 +; 
CHECKSOFT-NEXT: vmov r2, r3, d1 +; CHECKSOFT-NEXT: vmov r0, r1, d0 ; CHECKSOFT-NEXT: bx lr entry: %b = fptrunc float %fb to half @@ -125,21 +117,17 @@ define <8 x half> @test_vset_laneq_f16_7(<8 x half> %a, float %fb) nounwind { ; CHECKHARD-LABEL: test_vset_laneq_f16_7: ; CHECKHARD: @ %bb.0: @ %entry -; CHECKHARD-NEXT: vcvtb.f16.f32 s4, s4 -; CHECKHARD-NEXT: vmov r0, s4 -; CHECKHARD-NEXT: vmov.16 d1[3], r0 +; CHECKHARD-NEXT: vcvtt.f16.f32 s3, s4 ; CHECKHARD-NEXT: bx lr ; ; CHECKSOFT-LABEL: test_vset_laneq_f16_7: ; CHECKSOFT: @ %bb.0: @ %entry -; CHECKSOFT-NEXT: vldr s0, [sp] -; CHECKSOFT-NEXT: vmov d17, r2, r3 -; CHECKSOFT-NEXT: vmov d16, r0, r1 -; CHECKSOFT-NEXT: vcvtb.f16.f32 s0, s0 -; CHECKSOFT-NEXT: vmov r12, s0 -; CHECKSOFT-NEXT: vmov.16 d17[3], r12 -; CHECKSOFT-NEXT: vmov r0, r1, d16 -; CHECKSOFT-NEXT: vmov r2, r3, d17 +; CHECKSOFT-NEXT: vmov d1, r2, r3 +; CHECKSOFT-NEXT: vldr s4, [sp] +; CHECKSOFT-NEXT: vmov d0, r0, r1 +; CHECKSOFT-NEXT: vcvtt.f16.f32 s3, s4 +; CHECKSOFT-NEXT: vmov r0, r1, d0 +; CHECKSOFT-NEXT: vmov r2, r3, d1 ; CHECKSOFT-NEXT: bx lr entry: %b = fptrunc float %fb to half diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll --- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll +++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll @@ -968,8 +968,8 @@ define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) { ; CHECK-LABEL: frem_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vmov q5, q0 @@ -979,76 +979,61 @@ ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s20 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov s24, r0 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.16 q6[0], r4 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s24, s24 +; CHECK-NEXT: vcvtt.f16.f32 s24, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s21 -; CHECK-NEXT: vmov.16 q6[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s25, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s21 -; CHECK-NEXT: vmov.16 q6[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s25, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s22 -; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s26, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s22 -; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s26, s0 ; 
CHECK-NEXT: vcvtb.f32.f16 s0, s23 -; CHECK-NEXT: vmov.16 q6[5], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s27, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s23 -; CHECK-NEXT: vmov.16 q6[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vcvtt.f16.f32 s27, s0 ; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %out = frem <8 x half> %in1, %in2 ret <8 x half> %out diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll --- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -112,70 +112,55 @@ define arm_aapcs_vfpcc <8 x half> @cos_float16_t(<8 x half> %src) { ; CHECK-LABEL: cos_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s20, s20 +; CHECK-NEXT: vcvtt.f16.f32 s20, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s21, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s21, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s22, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s22, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s23, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vcvtt.f16.f32 s23, s0 ; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast 
<8 x half> @llvm.cos.v8f16(<8 x half> %src) ret <8 x half> %0 @@ -239,70 +224,55 @@ define arm_aapcs_vfpcc <8 x half> @sin_float16_t(<8 x half> %src) { ; CHECK-LABEL: sin_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s20, s20 +; CHECK-NEXT: vcvtt.f16.f32 s20, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s21, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s21, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s22, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s22, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s23, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vcvtt.f16.f32 s23, s0 ; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.sin.v8f16(<8 x half> %src) ret <8 x half> %0 @@ -366,70 +336,55 @@ define arm_aapcs_vfpcc <8 x half> @exp_float16_t(<8 x half> %src) { ; CHECK-LABEL: exp_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s20, s20 +; CHECK-NEXT: 
vcvtt.f16.f32 s20, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s21, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s21, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s22, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s22, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s23, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vcvtt.f16.f32 s23, s0 ; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.exp.v8f16(<8 x half> %src) ret <8 x half> %0 @@ -493,70 +448,55 @@ define arm_aapcs_vfpcc <8 x half> @exp2_float16_t(<8 x half> %src) { ; CHECK-LABEL: exp2_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s20, s20 +; CHECK-NEXT: vcvtt.f16.f32 s20, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s21, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s21, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s22, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: 
vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s22, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s23, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vcvtt.f16.f32 s23, s0 ; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.exp2.v8f16(<8 x half> %src) ret <8 x half> %0 @@ -620,70 +560,55 @@ define arm_aapcs_vfpcc <8 x half> @log_float16_t(<8 x half> %src) { ; CHECK-LABEL: log_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s20, s20 +; CHECK-NEXT: vcvtt.f16.f32 s20, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s21, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s21, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s22, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s22, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s23, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vcvtt.f16.f32 s23, s0 ; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.log.v8f16(<8 x half> %src) ret <8 x half> %0 @@ -747,70 +672,55 @@ define arm_aapcs_vfpcc <8 x half> @log2_float16_t(<8 x half> %src) { ; CHECK-LABEL: log2_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, 
lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s20, s20 +; CHECK-NEXT: vcvtt.f16.f32 s20, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s21, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s21, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s22, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s22, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s23, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vcvtt.f16.f32 s23, s0 ; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.log2.v8f16(<8 x half> %src) ret <8 x half> %0 @@ -874,70 +784,55 @@ define arm_aapcs_vfpcc <8 x half> @log10_float16_t(<8 x half> %src) { ; CHECK-LABEL: log10_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s20, s20 +; CHECK-NEXT: vcvtt.f16.f32 s20, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov 
r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s21, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 -; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s21, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[3], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s22, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 -; CHECK-NEXT: vmov.16 q5[4], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s22, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s23, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 -; CHECK-NEXT: vmov.16 q5[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vcvtt.f16.f32 s23, s0 ; CHECK-NEXT: vmov q0, q5 ; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.log10.v8f16(<8 x half> %src) ret <8 x half> %0 @@ -1007,8 +902,8 @@ define arm_aapcs_vfpcc <8 x half> @pow_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-LABEL: pow_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vmov q5, q0 @@ -1018,76 +913,61 @@ ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s20 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov s24, r0 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.16 q6[0], r4 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s24, s24 +; CHECK-NEXT: vcvtt.f16.f32 s24, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s21 -; CHECK-NEXT: vmov.16 q6[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s25, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s21 -; CHECK-NEXT: vmov.16 q6[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s25, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s22 -; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s26, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s22 -; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov r0, s0 ; 
CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtt.f16.f32 s26, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s23 -; CHECK-NEXT: vmov.16 q6[5], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s27, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s23 -; CHECK-NEXT: vmov.16 q6[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vcvtt.f16.f32 s27, s0 ; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.pow.v8f16(<8 x half> %src1, <8 x half> %src2) ret <8 x half> %0 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -1225,23 +1225,15 @@ ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: vmov r1, s4 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s1 -; CHECK-LE-NEXT: vmov r2, s4 -; CHECK-LE-NEXT: vmov.16 q1[0], r1 -; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s2 -; CHECK-LE-NEXT: vmov.16 q1[1], r2 -; CHECK-LE-NEXT: vmov r1, s8 -; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s3 -; CHECK-LE-NEXT: vmov.16 q1[2], r1 -; CHECK-LE-NEXT: vmov r1, s8 -; CHECK-LE-NEXT: vmov.16 q1[3], r1 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 ; CHECK-LE-NEXT: mov.w r1, #0 -; CHECK-LE-NEXT: vcmp.f32 s1, #0 +; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 ; CHECK-LE-NEXT: it gt ; CHECK-LE-NEXT: movgt r1, #1 ; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: vcmp.f32 s1, #0 ; CHECK-LE-NEXT: cset r1, ne ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: and r1, r1, #1 @@ -1314,25 +1306,18 @@ ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: vcmp.f32 s4, #0 -; CHECK-BE-NEXT: vmov r1, s0 -; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s5 -; CHECK-BE-NEXT: vmov r2, s0 -; CHECK-BE-NEXT: vmov.16 q0[0], r1 -; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s6 -; CHECK-BE-NEXT: vmov.16 q0[1], r2 -; CHECK-BE-NEXT: vmov r1, s8 -; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s7 -; CHECK-BE-NEXT: vmov.16 q0[2], r1 -; CHECK-BE-NEXT: vmov r1, s8 -; CHECK-BE-NEXT: vmov.16 q0[3], r1 +; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 ; CHECK-BE-NEXT: vcmp.f32 s5, #0 +; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 ; CHECK-BE-NEXT: it gt ; CHECK-BE-NEXT: movgt r1, #1 ; CHECK-BE-NEXT: cmp r1, #0 +; CHECK-BE-NEXT: mov.w r2, #0 ; CHECK-BE-NEXT: cset r1, ne ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-BE-NEXT: and r1, r1, #1 @@ -1349,7 +1334,6 @@ ; CHECK-BE-NEXT: and r3, r3, #1 ; CHECK-BE-NEXT: vcmp.f32 s7, #0 ; CHECK-BE-NEXT: rsb.w r3, r3, #0 -; CHECK-BE-NEXT: mov.w r2, #0 ; CHECK-BE-NEXT: bfi r1, r3, #1, #1 ; CHECK-BE-NEXT: mov.w r3, #0 ; CHECK-BE-NEXT: it gt @@ -1413,23 +1397,15 @@ ; CHECK-LE-NEXT: sub sp, #4 ; 
CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: vmov r1, s4 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s1 -; CHECK-LE-NEXT: vmov r2, s4 -; CHECK-LE-NEXT: vmov.16 q1[0], r1 -; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s2 -; CHECK-LE-NEXT: vmov.16 q1[1], r2 -; CHECK-LE-NEXT: vmov r1, s8 -; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s3 -; CHECK-LE-NEXT: vmov.16 q1[2], r1 -; CHECK-LE-NEXT: vmov r1, s8 -; CHECK-LE-NEXT: vmov.16 q1[3], r1 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 ; CHECK-LE-NEXT: mov.w r1, #0 -; CHECK-LE-NEXT: vcmp.f32 s1, #0 +; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 ; CHECK-LE-NEXT: it gt ; CHECK-LE-NEXT: movgt r1, #1 ; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: vcmp.f32 s1, #0 ; CHECK-LE-NEXT: cset r1, ne ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: and r1, r1, #1 @@ -1502,25 +1478,18 @@ ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: vcmp.f32 s4, #0 -; CHECK-BE-NEXT: vmov r1, s0 -; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s5 -; CHECK-BE-NEXT: vmov r2, s0 -; CHECK-BE-NEXT: vmov.16 q0[0], r1 -; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s6 -; CHECK-BE-NEXT: vmov.16 q0[1], r2 -; CHECK-BE-NEXT: vmov r1, s8 -; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s7 -; CHECK-BE-NEXT: vmov.16 q0[2], r1 -; CHECK-BE-NEXT: vmov r1, s8 -; CHECK-BE-NEXT: vmov.16 q0[3], r1 +; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 ; CHECK-BE-NEXT: vcmp.f32 s5, #0 +; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 ; CHECK-BE-NEXT: it gt ; CHECK-BE-NEXT: movgt r1, #1 ; CHECK-BE-NEXT: cmp r1, #0 +; CHECK-BE-NEXT: mov.w r2, #0 ; CHECK-BE-NEXT: cset r1, ne ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-BE-NEXT: and r1, r1, #1 @@ -1537,7 +1506,6 @@ ; CHECK-BE-NEXT: and r3, r3, #1 ; CHECK-BE-NEXT: vcmp.f32 s7, #0 ; CHECK-BE-NEXT: rsb.w r3, r3, #0 -; CHECK-BE-NEXT: mov.w r2, #0 ; CHECK-BE-NEXT: bfi r1, r3, #1, #1 ; CHECK-BE-NEXT: mov.w r3, #0 ; CHECK-BE-NEXT: it gt @@ -1601,23 +1569,15 @@ ; CHECK-LE-NEXT: sub sp, #20 ; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: vmov r1, s4 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s1 -; CHECK-LE-NEXT: vmov r2, s4 -; CHECK-LE-NEXT: vmov.16 q1[0], r1 -; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s2 -; CHECK-LE-NEXT: vmov.16 q1[1], r2 -; CHECK-LE-NEXT: vmov r1, s8 -; CHECK-LE-NEXT: vcvtb.f16.f32 s8, s3 -; CHECK-LE-NEXT: vmov.16 q1[2], r1 -; CHECK-LE-NEXT: vmov r1, s8 -; CHECK-LE-NEXT: vmov.16 q1[3], r1 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 ; CHECK-LE-NEXT: mov.w r1, #0 -; CHECK-LE-NEXT: vcmp.f32 s1, #0 +; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 ; CHECK-LE-NEXT: it gt ; CHECK-LE-NEXT: movgt r1, #1 ; CHECK-LE-NEXT: cmp r1, #0 +; CHECK-LE-NEXT: vcmp.f32 s1, #0 ; CHECK-LE-NEXT: cset r1, ne ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: and r1, r1, #1 @@ -1698,25 +1658,18 @@ ; CHECK-BE-NEXT: .pad #20 ; CHECK-BE-NEXT: sub sp, #20 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: vcmp.f32 s4, #0 -; CHECK-BE-NEXT: vmov r1, s0 -; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s5 -; CHECK-BE-NEXT: vmov r2, s0 -; CHECK-BE-NEXT: vmov.16 q0[0], r1 -; CHECK-BE-NEXT: vcvtb.f16.f32 s8, s6 -; CHECK-BE-NEXT: vmov.16 q0[1], r2 -; CHECK-BE-NEXT: vmov r1, s8 -; 
CHECK-BE-NEXT: vcvtb.f16.f32 s8, s7 -; CHECK-BE-NEXT: vmov.16 q0[2], r1 -; CHECK-BE-NEXT: vmov r1, s8 -; CHECK-BE-NEXT: vmov.16 q0[3], r1 +; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-BE-NEXT: mov.w r1, #0 +; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 ; CHECK-BE-NEXT: vcmp.f32 s5, #0 +; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 ; CHECK-BE-NEXT: it gt ; CHECK-BE-NEXT: movgt r1, #1 ; CHECK-BE-NEXT: cmp r1, #0 +; CHECK-BE-NEXT: mov.w r2, #0 ; CHECK-BE-NEXT: cset r1, ne ; CHECK-BE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-BE-NEXT: and r1, r1, #1 @@ -1733,7 +1686,6 @@ ; CHECK-BE-NEXT: and r3, r3, #1 ; CHECK-BE-NEXT: vcmp.f32 s7, #0 ; CHECK-BE-NEXT: rsb.w r3, r3, #0 -; CHECK-BE-NEXT: mov.w r2, #0 ; CHECK-BE-NEXT: bfi r1, r3, #1, #1 ; CHECK-BE-NEXT: mov.w r3, #0 ; CHECK-BE-NEXT: it gt diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll @@ -38,17 +38,9 @@ ; CHECK-LABEL: fptrunc_4: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vcvtb.f16.f32 s4, s0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vcvtb.f16.f32 s4, s1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vcvtb.f16.f32 s8, s2 -; CHECK-NEXT: vmov.16 q1[1], r1 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vcvtb.f16.f32 s0, s3 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vcvtt.f16.f32 s4, s1 +; CHECK-NEXT: vcvtb.f16.f32 s5, s2 +; CHECK-NEXT: vcvtt.f16.f32 s5, s3 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -61,29 +53,13 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s8 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s9 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vcvtb.f16.f32 s12, s10 -; CHECK-NEXT: vmov.16 q0[1], r1 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vcvtb.f16.f32 s8, s11 -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vcvtb.f16.f32 s8, s4 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vcvtb.f16.f32 s8, s5 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vcvtb.f16.f32 s8, s6 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vcvtb.f16.f32 s4, s7 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vcvtt.f16.f32 s0, s9 +; CHECK-NEXT: vcvtb.f16.f32 s1, s10 +; CHECK-NEXT: vcvtt.f16.f32 s1, s11 +; CHECK-NEXT: vcvtb.f16.f32 s2, s4 +; CHECK-NEXT: vcvtt.f16.f32 s2, s5 +; CHECK-NEXT: vcvtb.f16.f32 s3, s6 +; CHECK-NEXT: vcvtt.f16.f32 s3, s7 ; CHECK-NEXT: bx lr entry: %out = fptrunc <8 x float> %src1 to <8 x half>
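
Note: a minimal LLVM IR sketch (hypothetical function name, not part of the patch) of the insertelement-of-fpround shape the new FP16Pat rules match. With these patterns, an odd destination lane selects VCVTTSH (vcvtt.f16.f32) and an even lane selects VCVTBSH (vcvtb.f16.f32), writing the half directly into the top or bottom of the containing S register instead of round-tripping through a GPR and vmov.16, as the updated CHECK lines above show.

define arm_aapcs_vfpcc <8 x half> @set_lane1(<8 x half> %v, float %f) {
entry:
  ; fptrunc feeding an odd lane (lane 1, the top half of s0) should now
  ; select a single vcvtt.f16.f32; an even lane would select vcvtb.f16.f32.
  %h = fptrunc float %f to half
  %r = insertelement <8 x half> %v, half %h, i32 1
  ret <8 x half> %r
}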