Index: llvm/trunk/include/llvm/IR/IntrinsicsARM.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsARM.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsARM.td
@@ -717,4 +717,14 @@
 def int_arm_neon_sha256h2: SHA_3Arg_v4i32_Intrinsic;
 def int_arm_neon_sha256su1: SHA_3Arg_v4i32_Intrinsic;
 
+// Armv8.2-A dot product instructions
+class Neon_Dot_Intrinsic
+  : Intrinsic<[llvm_anyvector_ty],
+              [LLVMMatchType<0>, llvm_anyvector_ty,
+               LLVMMatchType<1>],
+              [IntrNoMem]>;
+def int_arm_neon_udot : Neon_Dot_Intrinsic;
+def int_arm_neon_sdot : Neon_Dot_Intrinsic;
+
+
 } // end TargetPrefix
Index: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
@@ -4682,37 +4682,59 @@
 // We put them in the VFPV8 decoder namespace because the ARM and Thumb
 // encodings are the same and thus no further bit twiddling is necessary
 // in the disassembler.
-let Predicates = [HasDotProd], DecoderNamespace = "VFPV8" in {
-
-def VUDOTD : N3Vnp<0b11000, 0b10, 0b1101, 0b0, 0b1,
-                   (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm),
-                   N3RegFrm, IIC_VDOTPROD, "vudot", "u8", []>;
-def VSDOTD : N3Vnp<0b11000, 0b10, 0b1101, 0b0, 0b0,
-                   (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm),
-                   N3RegFrm, IIC_VDOTPROD, "vsdot", "s8", []>;
-def VUDOTQ : N3Vnp<0b11000, 0b10, 0b1101, 0b1, 0b1,
-                   (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm),
-                   N3RegFrm, IIC_VDOTPROD, "vudot", "u8", []>;
-def VSDOTQ : N3Vnp<0b11000, 0b10, 0b1101, 0b1, 0b0,
-                   (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm),
-                   N3RegFrm, IIC_VDOTPROD, "vsdot", "s8", []>;
+class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy,
+           ValueType AccumTy, ValueType InputTy,
+           SDPatternOperator OpNode> :
+      N3Vnp<0b11000, 0b10, 0b1101, op6, op4, (outs RegTy:$dst),
+            (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD,
+            Asm, AsmTy,
+            [(set (AccumTy RegTy:$dst),
+                  (OpNode (AccumTy RegTy:$Vd),
+                          (InputTy RegTy:$Vn),
+                          (InputTy RegTy:$Vm)))]> {
+  let Predicates = [HasDotProd];
+  let DecoderNamespace = "VFPV8";
+  let Constraints = "$dst = $Vd";
+}
+
+def VUDOTD : VDOT<0, 1, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>;
+def VSDOTD : VDOT<0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>;
+def VUDOTQ : VDOT<1, 1, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>;
+def VSDOTQ : VDOT<1, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>;
 
 // Indexed dot product instructions:
-class DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty> :
-  N3Vnp<0b11100, 0b10, 0b1101, Q, U,
-        (outs Ty:$Vd), (ins Ty:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
-        N3RegFrm, IIC_VDOTPROD, opc, dt, []> {
-  bit lane;
-  let Inst{5} = lane;
-  let AsmString = !strconcat(opc, ".", dt, "\t$Vd, $Vn, $Vm$lane");
-}
-
-def VUDOTDI : DOTI<"vudot", "u8", 0b0, 0b1, DPR>;
-def VSDOTDI : DOTI<"vsdot", "s8", 0b0, 0b0, DPR>;
-def VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR>;
-def VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR>;
+multiclass DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty,
+           ValueType AccumType, ValueType InputType, SDPatternOperator OpNode,
+           dag RHS> {
+  def "" : N3Vnp<0b11100, 0b10, 0b1101, Q, U, (outs Ty:$dst),
+                 (ins Ty:$Vd, Ty:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+                 N3RegFrm, IIC_VDOTPROD, opc, dt, []> {
+    bit lane;
+    let Inst{5} = lane;
+    let AsmString = !strconcat(opc, ".", dt, "\t$Vd, $Vn, $Vm$lane");
+    let Constraints = "$dst = $Vd";
+    let Predicates = [HasDotProd];
+    let DecoderNamespace = "VFPV8";
+  }
+
+  def : Pat<
+    (AccumType (OpNode (AccumType Ty:$Vd),
+                       (InputType Ty:$Vn),
+                       (InputType (bitconvert (AccumType
+                                  (NEONvduplane (AccumType Ty:$Vm),
+                                                 VectorIndex32:$lane)))))),
+    (!cast<Instruction>(NAME) Ty:$Vd, Ty:$Vn, RHS, VectorIndex32:$lane)>;
+}
+
+defm VUDOTDI : DOTI<"vudot", "u8", 0b0, 0b1, DPR, v2i32, v8i8,
+                    int_arm_neon_udot, (v2i32 DPR_VFP2:$Vm)>;
+defm VSDOTDI : DOTI<"vsdot", "s8", 0b0, 0b0, DPR, v2i32, v8i8,
+                    int_arm_neon_sdot, (v2i32 DPR_VFP2:$Vm)>;
+defm VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR, v4i32, v16i8,
+                    int_arm_neon_udot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+defm VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR, v4i32, v16i8,
+                    int_arm_neon_sdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
 
-} // HasDotProd
 
 // ARMv8.3 complex operations
 class BaseN3VCP8ComplexTied
[NOTE(review): extraction gap. The remainder of the context line above (its template parameter list), the diff header of the newly added test file, and that file's leading lines (presumably a RUN line; confirm against the original review) were lost and are not reconstructed here. The "+" lines below belong to that test file.]
+declare <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
+declare <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
+declare <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
+
+define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdot_u32:
+; CHECK: vudot.u8 d0, d1, d2
+  %vdot1.i = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
+  ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdotq_u32:
+; CHECK: vudot.u8 q0, q1, q2
+  %vdot1.i = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
+  ret <4 x i32> %vdot1.i
+}
+
+define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdot_s32:
+; CHECK: vsdot.s8 d0, d1, d2
+  %vdot1.i = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
+  ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdotq_s32:
+; CHECK: vsdot.s8 q0, q1, q2
+  %vdot1.i = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
+  ret <4 x i32> %vdot1.i
+}
+
+define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
+entry:
+; CHECK-LABEL: test_vdot_lane_u32:
+; CHECK: vudot.u8 d0, d1, d2[1]
+  %.cast = bitcast <8 x i8> %c to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vdot1.i = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
+  ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
+entry:
+; CHECK-LABEL: test_vdotq_lane_u32:
+; CHECK: vudot.u8 q0, q1, d4[1]
+  %.cast = bitcast <8 x i8> %c to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vdot1.i = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
+  ret <4 x i32> %vdot1.i
+}
+
+define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
+entry:
+; CHECK-LABEL: test_vdot_lane_s32:
+; CHECK: vsdot.s8 d0, d1, d2[1]
+  %.cast = bitcast <8 x i8> %c to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vdot1.i = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
+  ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
+entry:
+; CHECK-LABEL: test_vdotq_lane_s32:
+; CHECK: vsdot.s8 q0, q1, d4[1]
+  %.cast = bitcast <8 x i8> %c to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vdot1.i = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
+  ret <4 x i32> %vdot1.i
+}