Index: lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- lib/Target/ARM/ARMISelDAGToDAG.cpp +++ lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1763,12 +1763,14 @@ default: llvm_unreachable("unhandled vld type"); // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4f16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; case MVT::v1i64: OpcodeIndex = 3; break; // Quad-register operations: case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8f16: case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; @@ -2070,10 +2072,12 @@ default: llvm_unreachable("unhandled vld/vst lane type"); // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4f16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; // Quad-register operations: + case MVT::v8f16: case MVT::v8i16: OpcodeIndex = 0; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 1; break; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -5734,16 +5734,16 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, bool is128Bits, - NEONModImmType type) { + NEONModImmType type, bool FP16 = false) { unsigned OpCmode, Imm; // SplatBitSize is set to the smallest size that splats the vector, so a // zero vector will always have SplatBitSize == 8. However, NEON modified // immediate instructions others than VMOV do not support the 8-bit encoding // of a zero vector, and the default encoding of zero is supposed to be the - // 32-bit version. + // 32-bit version, and the 16-bit version for f16 vectors. 
if (SplatBits == 0) - SplatBitSize = 32; + SplatBitSize = FP16 ? 16 : 32; switch (SplatBitSize) { case 8: @@ -6384,10 +6384,11 @@ if (SplatBitSize <= 64) { // Check if an immediate VMOV works. EVT VmovVT; + const bool FP16 = (VT == MVT::v4f16 || VT == MVT::v8f16); SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), - VMOVModImm); + VMOVModImm, FP16); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -6465,9 +6466,10 @@ unsigned EltSize = VT.getScalarSizeInBits(); - // Use VDUP for non-constant splats. For f32 constant splats, reduce to - // i32 and try again. + // Use VDUP for non-constant splats. For f32 and f16 constant splats, reduce to + // i32 and i16 and try again. if (hasDominantValue && EltSize <= 32) { + EVT IntEltType = (EltSize == 32 ? MVT::i32 : MVT::i16); if (!isConstant) { SDValue N; @@ -6514,9 +6516,9 @@ if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IntEltType, Op.getOperand(i))); - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IntEltType, NumElts); SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) Index: lib/Target/ARM/ARMInstrNEON.td =================================================================== --- lib/Target/ARM/ARMInstrNEON.td +++ lib/Target/ARM/ARMInstrNEON.td @@ -7142,6 +7142,7 @@ } def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; let Predicates = [IsLE] in { + def : Pat<(v4f16 (bitconvert (v4i16 DPR:$src))), (v4f16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>; def : 
Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; @@ -7185,6 +7186,7 @@ } def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; let Predicates = [IsLE] in { + def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; @@ -7223,6 +7225,7 @@ def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v4f16 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>; @@ -7256,6 +7259,7 @@ def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>; Index: test/CodeGen/ARM/fp16-reduction.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/fp16-reduction.ll @@ -0,0 +1,227 @@ +; RUN: llc < %s | FileCheck %s +; RUN: llc -mtriple armeb-unknown < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv8.2a-arm-unknown-eabihf" + +define dso_local float @vec8_zero_init(half* nocapture readonly %V, i32 %N) local_unnamed_addr #0 { 
+entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %min.iters.check = icmp ult i32 %N, 8 + br i1 %min.iters.check, label %for.body.preheader15, label %vector.ph + +for.body.preheader15: + %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %Tmp.07.ph = phi half [ 0xH0000, %for.body.preheader ], [ 0xH8000, %middle.block ] + br label %for.body + +vector.ph: + %n.vec = and i32 %N, -8 + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <8 x half> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ] + +; CHECK-LABEL: vec8_zero_init: +; CHECK: vmov.i16 q8, #0x0 + + %0 = getelementptr inbounds half, half* %V, i32 %index + %1 = bitcast half* %0 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %1, align 2 + %2 = fadd fast <8 x half> %wide.load, %vec.phi + %index.next = add i32 %index, 8 + %3 = icmp eq i32 %index.next, %n.vec + br i1 %3, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf = shufflevector <8 x half> %2, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx = fadd fast <8 x half> %2, %rdx.shuf + %rdx.shuf11 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx12 = fadd fast <8 x half> %bin.rdx, %rdx.shuf11 + %rdx.shuf13 = shufflevector <8 x half> %bin.rdx12, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx14 = fadd fast <8 x half> %bin.rdx12, %rdx.shuf13 + +; TODO: support v8f16 extractelement +; %4 = extractelement <8 x half> %bin.rdx14, i32 0 + %cmp.n = icmp eq i32 %n.vec, %N + br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %for.body.preheader15 + +for.cond.cleanup.loopexit: +; %add.lcssa = phi half [ %4, %middle.block ], [ %add, %for.body ] + %add.lcssa = phi half [ 0.000000e+00, %middle.block ], [ %add, %for.body ] + %phitmp = bitcast half %add.lcssa to i16 + %phitmp9 = zext i16 %phitmp to i32 + %phitmp10 = bitcast i32 %phitmp9 to float + br label 
%for.cond.cleanup + +for.cond.cleanup: + %Tmp.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp10, %for.cond.cleanup.loopexit ] + ret float %Tmp.0.lcssa + +for.body: + %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader15 ] + %Tmp.07 = phi half [ %add, %for.body ], [ %Tmp.07.ph, %for.body.preheader15 ] + %arrayidx = getelementptr inbounds half, half* %V, i32 %i.08 + %V5 = load half, half* %arrayidx, align 2 + %add = fadd fast half %V5, %Tmp.07 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + + +define dso_local float @vec8_one_init(half* nocapture readonly %V, i32 %N) local_unnamed_addr #0 { +entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %min.iters.check = icmp ult i32 %N, 8 + br i1 %min.iters.check, label %for.body.preheader15, label %vector.ph + +for.body.preheader15: + %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %Tmp.07.ph = phi half [ 0xH0000, %for.body.preheader ], [ 0xH8000, %middle.block ] + br label %for.body + +vector.ph: + %n.vec = and i32 %N, -8 + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <8 x half> [ , %vector.ph ], [ %2, %vector.body ] + +; CHECK-LABEL: vec8_one_init: +; CHECK: adr r2, .LCPI1_2 +; CHECK: .LCPI1_2: +; CHECK-NEXT: .short 15360 @ half 1 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 + + + %0 = getelementptr inbounds half, half* %V, i32 %index + %1 = bitcast half* %0 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %1, align 2 + %2 = fadd fast <8 x half> %wide.load, %vec.phi + %index.next = add i32 %index, 8 + %3 = icmp eq i32 %index.next, 
%n.vec + br i1 %3, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf = shufflevector <8 x half> %2, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx = fadd fast <8 x half> %2, %rdx.shuf + %rdx.shuf11 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx12 = fadd fast <8 x half> %bin.rdx, %rdx.shuf11 + %rdx.shuf13 = shufflevector <8 x half> %bin.rdx12, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx14 = fadd fast <8 x half> %bin.rdx12, %rdx.shuf13 + +; TODO: support v8f16 extractelement +; %4 = extractelement <8 x half> %bin.rdx14, i32 0 + %cmp.n = icmp eq i32 %n.vec, %N + br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %for.body.preheader15 + +for.cond.cleanup.loopexit: +; %add.lcssa = phi half [ %4, %middle.block ], [ %add, %for.body ] + %add.lcssa = phi half [ 0.000000e+00, %middle.block ], [ %add, %for.body ] + %phitmp = bitcast half %add.lcssa to i16 + %phitmp9 = zext i16 %phitmp to i32 + %phitmp10 = bitcast i32 %phitmp9 to float + br label %for.cond.cleanup + +for.cond.cleanup: + %Tmp.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp10, %for.cond.cleanup.loopexit ] + ret float %Tmp.0.lcssa + +for.body: + %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader15 ] + %Tmp.07 = phi half [ %add, %for.body ], [ %Tmp.07.ph, %for.body.preheader15 ] + %arrayidx = getelementptr inbounds half, half* %V, i32 %i.08 + %V5 = load half, half* %arrayidx, align 2 + %add = fadd fast half %V5, %Tmp.07 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + + + +define dso_local float @vec4_zero_init(half* nocapture readonly %V, i32 %N) local_unnamed_addr #0 { +entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %min.iters.check = icmp ult i32 %N, 8 + br i1 %min.iters.check, label %for.body.preheader15, label %vector.ph + +for.body.preheader15: + %i.08.ph = phi i32 [ 0, 
%for.body.preheader ], [ %n.vec, %middle.block ] + %Tmp.07.ph = phi half [ 0xH0000, %for.body.preheader ], [ 0xH8000, %middle.block ] + br label %for.body + +vector.ph: + %n.vec = and i32 %N, -8 + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x half> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ] + +; CHECK-LABEL: vec4_zero_init: +; CHECK: vmov.i16 d16, #0x0 + + %0 = getelementptr inbounds half, half* %V, i32 %index + %1 = bitcast half* %0 to <4 x half>* + %wide.load = load <4 x half>, <4 x half>* %1, align 2 + %2 = fadd fast <4 x half> %wide.load, %vec.phi + %index.next = add i32 %index, 8 + %3 = icmp eq i32 %index.next, %n.vec + br i1 %3, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf = shufflevector <4 x half> %2, <4 x half> undef, <4 x i32> + %bin.rdx = fadd fast <4 x half> %2, %rdx.shuf + %rdx.shuf11 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> + %bin.rdx12 = fadd fast <4 x half> %bin.rdx, %rdx.shuf11 + %rdx.shuf13 = shufflevector <4 x half> %bin.rdx12, <4 x half> undef, <4 x i32> + %bin.rdx14 = fadd fast <4 x half> %bin.rdx12, %rdx.shuf13 + +; TODO: support v4f16 extractelement +; %4 = extractelement <4 x half> %bin.rdx14, i32 0 + %cmp.n = icmp eq i32 %n.vec, %N + br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %for.body.preheader15 + +for.cond.cleanup.loopexit: +; %add.lcssa = phi half [ %4, %middle.block ], [ %add, %for.body ] + %add.lcssa = phi half [ 0.000000e+00, %middle.block ], [ %add, %for.body ] + %phitmp = bitcast half %add.lcssa to i16 + %phitmp9 = zext i16 %phitmp to i32 + %phitmp10 = bitcast i32 %phitmp9 to float + br label %for.cond.cleanup + +for.cond.cleanup: + %Tmp.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp10, %for.cond.cleanup.loopexit ] + ret float %Tmp.0.lcssa + +for.body: + %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader15 ] + %Tmp.07 = phi half [ %add, %for.body ], [ 
%Tmp.07.ph, %for.body.preheader15 ] + %arrayidx = getelementptr inbounds half, half* %V, i32 %i.08 + %V5 = load half, half* %arrayidx, align 2 + %add = fadd fast half %V5, %Tmp.07 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.2-a,+crc,+crypto,+dsp,+fp-armv8,+fullfp16,+hwdiv,+hwdiv-arm,+neon,+ras,+strict-align,-thumb-mode" "unsafe-fp-math"="true" "use-soft-float"="false" }