Index: lib/Target/ARM/ARMCallingConv.td
===================================================================
--- lib/Target/ARM/ARMCallingConv.td
+++ lib/Target/ARM/ARMCallingConv.td
@@ -30,8 +30,8 @@
   CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,

   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v8f16], CCBitConvertToType<v2f64>>,

   // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
   CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
@@ -55,8 +55,8 @@
   CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,

   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,

   CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
@@ -69,8 +69,8 @@
 //===----------------------------------------------------------------------===//
 def FastCC_ARM_APCS : CallingConv<[
   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,

   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
@@ -88,8 +88,8 @@
 def RetFastCC_ARM_APCS : CallingConv<[
   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,

   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
@@ -104,8 +104,8 @@
 def CC_ARM_APCS_GHC : CallingConv<[
   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,

   CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
   CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>,
@@ -160,8 +160,8 @@
   CCIfNest<CCAssignToReg<[R12]>>,

   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,

   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -176,8 +176,8 @@
 def RetCC_ARM_AAPCS : CallingConv<[
   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,

   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -200,8 +200,8 @@
   CCIfByVal<CCPassByVal<4, 4>>,

   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,

   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -221,8 +221,8 @@
 def RetCC_ARM_AAPCS_VFP : CallingConv<[
   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,

   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
Index: lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2193,10 +2193,12 @@
   default: llvm_unreachable("unhandled vld/vst lane type");
   // Double-register operations:
   case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4f16:
   case MVT::v4i16: OpcodeIndex = 1; break;
   case MVT::v2f32:
   case MVT::v2i32: OpcodeIndex = 2; break;
   // Quad-register operations:
+  case MVT::v8f16:
   case MVT::v8i16: OpcodeIndex = 0; break;
   case MVT::v4f32:
   case MVT::v4i32: OpcodeIndex = 1; break;
@@ -2311,8 +2313,12 @@
   default: llvm_unreachable("unhandled vld-dup type");
   case MVT::v8i8:  Opc = DOpcodes[0]; break;
   case MVT::v16i8: Opc = QOpcodes[0]; break;
-  case MVT::v4i16: Opc = DOpcodes[1]; break;
-  case MVT::v8i16: Opc = QOpcodes[1]; break;
+  case MVT::v4f16:
+  case MVT::v4i16:
+    Opc = DOpcodes[1]; break;
+  case MVT::v8f16:
+  case MVT::v8i16:
+    Opc = QOpcodes[1]; break;
   case MVT::v2f32:
   case MVT::v2i32: Opc = DOpcodes[2]; break;
   case MVT::v4f32:
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -553,6 +553,11 @@
     addDRTypeForNEON(MVT::v2i32);
     addDRTypeForNEON(MVT::v1i64);

+    if (Subtarget->hasFullFP16()) {
+      addDRTypeForNEON(MVT::v4f16);
+      addQRTypeForNEON(MVT::v8f16);
+    }
+
     addQRTypeForNEON(MVT::v4f32);
     addQRTypeForNEON(MVT::v2f64);
     addQRTypeForNEON(MVT::v16i8);
@@ -1201,7 +1206,7 @@
   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
   // the cost is 1 for both f32 and f64.
   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
-  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
+  case MVT::v4f16: case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
     RRC = &ARM::DPRRegClass;
     // When NEON is used for SP, only half of the register file is available
     // because operations that define both SP and DP results will be constrained
@@ -1210,7 +1215,7 @@
     if (Subtarget->useNEONForSinglePrecisionFP())
       Cost = 2;
     break;
-  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+  case MVT::v16i8: case MVT::v8i16: case MVT::v8f16: case MVT::v4i32: case MVT::v2i64:
   case MVT::v4f32: case MVT::v2f64:
     RRC = &ARM::DPRRegClass;
     Cost = 2;
Index: lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- lib/Target/ARM/ARMInstrNEON.td
+++ lib/Target/ARM/ARMInstrNEON.td
@@ -6771,6 +6771,19 @@
   def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>;
   def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
 }
+
+def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
+let Predicates = [IsLE,HasFullFP16] in {
+  def : Pat<(v4f16 (bitconvert (v2f32 DPR:$src))), (v4f16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (v4f16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (v2f32 DPR:$src))), (v4f16 DPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
+}
+
 def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
 let Predicates = [IsLE] in {
   def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
Index: test/CodeGen/ARM/vstlane-v4.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/vstlane-v4.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=arm -mattr=+neon,+fullfp16 %s -o - | FileCheck %s
+
+define void @vst1lanehalf(half* %A, <4 x half>* %B) nounwind {
+;CHECK-LABEL: vst1lanehalf:
+;Check the alignment value.  Max for this instruction is 16 bits:
+;CHECK: vst1.16 {d16[2]}, [r0:16]
+  %tmp1 = load <4 x half>, <4 x half>* %B
+  %tmp2 = extractelement <4 x half> %tmp1, i32 2
+  store half %tmp2, half* %A, align 8
+  ret void
+}
+
+define void @vst2lanehalf(half* %A, <4 x half>* %B) nounwind {
+;CHECK-LABEL: vst2lanehalf:
+;Check the alignment value.  Max for this instruction is 32 bits:
+;CHECK: vst2.16 {d16[1], d17[1]}, [r0:32]
+  %tmp0 = bitcast half* %A to i8*
+  %tmp1 = load <4 x half>, <4 x half>* %B
+  call void @llvm.arm.neon.vst2lane.p0i8.v4f16(i8* %tmp0, <4 x half> %tmp1, <4 x half> %tmp1, i32 1, i32 8)
+  ret void
+}
+
+;Check for a post-increment updating store with register increment.
+define void @vst2lanehalf_update(half** %ptr, <4 x half>* %B, i32 %inc) nounwind {
+;CHECK-LABEL: vst2lanehalf_update:
+;CHECK: vst2.16 {d16[1], d17[1]}, [r1], r2
+  %A = load half*, half** %ptr
+  %tmp0 = bitcast half* %A to i8*
+  %tmp1 = load <4 x half>, <4 x half>* %B
+  call void @llvm.arm.neon.vst2lane.p0i8.v4f16(i8* %tmp0, <4 x half> %tmp1, <4 x half> %tmp1, i32 1, i32 2)
+  %tmp2 = getelementptr half, half* %A, i32 %inc
+  store half* %tmp2, half** %ptr
+  ret void
+}
+
+declare void @llvm.arm.neon.vst2lane.p0i8.v4f16(i8*, <4 x half>, <4 x half>, i32, i32) nounwind
+
+define void @vst3lanehalf(half* %A, <4 x half>* %B) nounwind {
+;CHECK-LABEL: vst3lanehalf:
+;Check the (default) alignment value.  VST3 does not support alignment.
+;CHECK: vst3.16 {d16[1], d17[1], d18[1]}, [r0]
+  %tmp0 = bitcast half* %A to i8*
+  %tmp1 = load <4 x half>, <4 x half>* %B
+  call void @llvm.arm.neon.vst3lane.p0i8.v4f16(i8* %tmp0, <4 x half> %tmp1, <4 x half> %tmp1, <4 x half> %tmp1, i32 1, i32 8)
+  ret void
+}
+
+declare void @llvm.arm.neon.vst3lane.p0i8.v4f16(i8*, <4 x half>, <4 x half>, <4 x half>, i32, i32) nounwind
+
+define void @vst4lanehalf(half* %A, <4 x half>* %B) nounwind {
+;CHECK-LABEL: vst4lanehalf:
+;CHECK: vst4.16
+  %tmp0 = bitcast half* %A to i8*
+  %tmp1 = load <4 x half>, <4 x half>* %B
+  call void @llvm.arm.neon.vst4lane.p0i8.v4f16(i8* %tmp0, <4 x half> %tmp1, <4 x half> %tmp1, <4 x half> %tmp1, <4 x half> %tmp1, i32 1, i32 1)
+  ret void
+}
+
+declare void @llvm.arm.neon.vst4lane.p0i8.v4f16(i8*, <4 x half>, <4 x half>, <4 x half>, <4 x half>, i32, i32) nounwind
Index: test/CodeGen/ARM/vstlane-v8.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/vstlane-v8.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=arm -mattr=+neon,+fullfp16 %s -o - | FileCheck %s
+
+define void @vst1laneQhalf(half* %A, <8 x half>* %B) nounwind {
+;CHECK-LABEL: vst1laneQhalf:
+;CHECK: vst1.16 {d17[1]}, [r0:16]
+  %tmp1 = load <8 x half>, <8 x half>* %B
+  %tmp2 = extractelement <8 x half> %tmp1, i32 5
+  store half %tmp2, half* %A, align 8
+  ret void
+}
+
+define void @vst2laneQhalf(half* %A, <8 x half>* %B) nounwind {
+;CHECK-LABEL: vst2laneQhalf:
+;Check the (default) alignment.
+;CHECK: vst2.16 {d17[1], d19[1]}, [r0]
+  %tmp0 = bitcast half* %A to i8*
+  %tmp1 = load <8 x half>, <8 x half>* %B
+  call void @llvm.arm.neon.vst2lane.p0i8.v8f16(i8* %tmp0, <8 x half> %tmp1, <8 x half> %tmp1, i32 5, i32 1)
+  ret void
+}
+
+declare void @llvm.arm.neon.vst2lane.p0i8.v8f16(i8*, <8 x half>, <8 x half>, i32, i32) nounwind
+
+define void @vst3laneQhalf(half* %A, <8 x half>* %B) nounwind {
+;CHECK-LABEL: vst3laneQhalf:
+;Check the (default) alignment value.  VST3 does not support alignment.
+;CHECK: vst3.16 {d17[2], d19[2], d21[2]}, [r0]
+  %tmp0 = bitcast half* %A to i8*
+  %tmp1 = load <8 x half>, <8 x half>* %B
+  call void @llvm.arm.neon.vst3lane.p0i8.v8f16(i8* %tmp0, <8 x half> %tmp1, <8 x half> %tmp1, <8 x half> %tmp1, i32 6, i32 8)
+  ret void
+}
+
+declare void @llvm.arm.neon.vst3lane.p0i8.v8f16(i8*, <8 x half>, <8 x half>, <8 x half>, i32, i32) nounwind
+
+define void @vst4laneQhalf(half* %A, <8 x half>* %B) nounwind {
+;CHECK-LABEL: vst4laneQhalf:
+;Check the alignment value.  Max for this instruction is 64 bits:
+;CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64]
+  %tmp0 = bitcast half* %A to i8*
+  %tmp1 = load <8 x half>, <8 x half>* %B
+  call void @llvm.arm.neon.vst4lane.p0i8.v8f16(i8* %tmp0, <8 x half> %tmp1, <8 x half> %tmp1, <8 x half> %tmp1, <8 x half> %tmp1, i32 7, i32 16)
+  ret void
+}
+
+define <8 x half> @variable_insertelement(<8 x half> %a, half %b, i32 %c) nounwind readnone {
+;CHECK-LABEL: variable_insertelement:
+  %r = insertelement <8 x half> %a, half %b, i32 %c
+  ret <8 x half> %r
+}
+
+declare void @llvm.arm.neon.vst4lane.p0i8.v8f16(i8*, <8 x half>, <8 x half>, <8 x half>, <8 x half>, i32, i32) nounwind
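Note (not part of the patch): the new tests above exercise the VSTn lane handling, but nothing here directly covers the ARMCallingConv.td hunks. A minimal sketch of such a test is below, reusing the same RUN line as the new files; the function name and the idea of checking only the label are illustrative assumptions, since the exact registers used depend on whether the hard-float or soft-float AAPCS variant is in effect.

; RUN: llc -mtriple=arm -mattr=+neon,+fullfp16 %s -o - | FileCheck %s

; With the CCBitConvertToType<f64> rule added above, a <4 x half> argument
; and return value should be handled by the calling convention like an f64
; (a D register under AAPCS-VFP, a GPR pair under the soft-float variant).
define <4 x half> @ret_v4f16_arg(<4 x half> %a, <4 x half> %b) nounwind {
;CHECK-LABEL: ret_v4f16_arg:
  ret <4 x half> %b
}

Whether this belongs in one of the new files or in a separate calling-convention test is left to the author.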