diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -100,6 +100,11 @@
 BUILTIN(__builtin_altivec_vmulosh, "V4SiV8SsV8Ss", "")
 BUILTIN(__builtin_altivec_vmulouw, "V2ULLiV4UiV4Ui", "")
 BUILTIN(__builtin_altivec_vmulosw, "V2SLLiV4SiV4Si", "")
+BUILTIN(__builtin_altivec_vmuleud, "V1ULLLiV2ULLiV2ULLi", "")
+BUILTIN(__builtin_altivec_vmulesd, "V1SLLLiV2SLLiV2SLLi", "")
+BUILTIN(__builtin_altivec_vmuloud, "V1ULLLiV2ULLiV2ULLi", "")
+BUILTIN(__builtin_altivec_vmulosd, "V1SLLLiV2SLLiV2SLLi", "")
+BUILTIN(__builtin_altivec_vmsumcud, "V1ULLLiV2ULLiV2ULLiV1ULLLi", "")
 
 BUILTIN(__builtin_altivec_vnmsubfp, "V4fV4fV4fV4f", "")
 
diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h
--- a/clang/lib/Headers/altivec.h
+++ b/clang/lib/Headers/altivec.h
@@ -5467,6 +5467,16 @@
   return __builtin_altivec_vmsumuhm(__a, __b, __c);
 }
 
+/* vec_msumc */
+
+#ifdef __POWER10_VECTOR__
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_msumc(vector unsigned long long __a, vector unsigned long long __b,
+          vector unsigned __int128 __c) {
+  return __builtin_altivec_vmsumcud(__a, __b, __c);
+}
+#endif
+
 /* vec_vmsummbm */
 
 static __inline__ vector int __attribute__((__always_inline__))
@@ -5693,6 +5703,26 @@
 }
 #endif
 
+#ifdef __POWER10_VECTOR__
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_mule(vector signed long long __a, vector signed long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosd(__a, __b);
+#else
+  return __builtin_altivec_vmulesd(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_mule(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloud(__a, __b);
+#else
+  return __builtin_altivec_vmuleud(__a, __b);
+#endif
+}
+#endif
+
 /* vec_vmulesb */
 
 static __inline__ vector short __attribute__((__always_inline__))
@@ -5795,6 +5825,26 @@
 }
 #endif
 
+#ifdef __POWER10_VECTOR__
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_mulo(vector signed long long __a, vector signed long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesd(__a, __b);
+#else
+  return __builtin_altivec_vmulosd(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_mulo(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleud(__a, __b);
+#else
+  return __builtin_altivec_vmuloud(__a, __b);
+#endif
+}
+#endif
+
 /* vec_vmulosb */
 
 static __inline__ vector short __attribute__((__always_inline__))
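As a quick illustration of the header API added above (not part of the patch): the new overloads widen each 64-bit doubleword product to a full 128-bit element, and vec_msumc exposes the carry-out of a 128-bit multiply-sum. A minimal sketch, assuming clang with -mcpu=pwr10 so that __POWER10_VECTOR__ is defined; the function name widen_mul_accumulate is hypothetical:

#include <altivec.h>

/* Hypothetical example: even/odd 64x64->128-bit multiplies plus the
   carry-out of the 128-bit multiply-sum a[0]*b[0] + a[1]*b[1] + acc. */
vector unsigned __int128
widen_mul_accumulate(vector unsigned long long a, vector unsigned long long b,
                     vector unsigned __int128 acc) {
  vector unsigned __int128 even = vec_mule(a, b);
  vector unsigned __int128 odd = vec_mulo(a, b);
  vector unsigned __int128 carry = vec_msumc(a, b, acc);
  /* vec_add on __int128 vectors has been available since POWER8 (vadduqm). */
  return vec_add(vec_add(even, odd), carry);
}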
diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c
--- a/clang/test/CodeGen/builtins-ppc-p10vector.c
+++ b/clang/test/CodeGen/builtins-ppc-p10vector.c
@@ -593,3 +593,41 @@
   // CHECK-NEXT: ret i32
   return vec_test_lsbb_all_zeros(vuca);
 }
+
+vector unsigned __int128 test_vec_mule_u128(void) {
+  // CHECK-BE: @llvm.ppc.altivec.vmuleud(<2 x i64>
+  // CHECK-BE-NEXT: ret <1 x i128>
+  // CHECK-LE: @llvm.ppc.altivec.vmuloud(<2 x i64>
+  // CHECK-LE-NEXT: ret <1 x i128>
+  return vec_mule(vulla, vullb);
+}
+
+vector signed __int128 test_vec_mule_s128(void) {
+  // CHECK-BE: @llvm.ppc.altivec.vmulesd(<2 x i64>
+  // CHECK-BE-NEXT: ret <1 x i128>
+  // CHECK-LE: @llvm.ppc.altivec.vmulosd(<2 x i64>
+  // CHECK-LE-NEXT: ret <1 x i128>
+  return vec_mule(vslla, vsllb);
+}
+
+vector unsigned __int128 test_vec_mulo_u128(void) {
+  // CHECK-BE: @llvm.ppc.altivec.vmuloud(<2 x i64>
+  // CHECK-BE-NEXT: ret <1 x i128>
+  // CHECK-LE: @llvm.ppc.altivec.vmuleud(<2 x i64>
+  // CHECK-LE-NEXT: ret <1 x i128>
+  return vec_mulo(vulla, vullb);
+}
+
+vector signed __int128 test_vec_mulo_s128(void) {
+  // CHECK-BE: @llvm.ppc.altivec.vmulosd(<2 x i64>
+  // CHECK-BE-NEXT: ret <1 x i128>
+  // CHECK-LE: @llvm.ppc.altivec.vmulesd(<2 x i64>
+  // CHECK-LE-NEXT: ret <1 x i128>
+  return vec_mulo(vslla, vsllb);
+}
+
+vector unsigned __int128 test_vec_msumc_u128(void) {
+  // CHECK: @llvm.ppc.altivec.vmsumcud(<2 x i64>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_msumc(vulla, vullb, vui128a);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -186,6 +186,13 @@
                           [llvm_v1i128_ty], [llvm_v1i128_ty, llvm_v1i128_ty],
                           [IntrNoMem]>;
 
+/// PowerPC_Vec_QDD_Intrinsic - A PowerPC intrinsic that takes two v2i64
+/// vectors and returns one v1i128. These intrinsics have no side effects.
+class PowerPC_Vec_QDD_Intrinsic<string GCCIntSuffix>
+  : PowerPC_Vec_Intrinsic<GCCIntSuffix,
+                          [llvm_v1i128_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+                          [IntrNoMem]>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC VSX Intrinsic Class Definitions.
 //
@@ -621,6 +628,9 @@
   def int_ppc_altivec_vmsumuhs : GCCBuiltin<"__builtin_altivec_vmsumuhs">,
             Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
                        llvm_v4i32_ty], [IntrNoMem]>;
+  def int_ppc_altivec_vmsumcud : GCCBuiltin<"__builtin_altivec_vmsumcud">,
+            Intrinsic<[llvm_v1i128_ty],
+                      [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v1i128_ty], [IntrNoMem]>;
 
   // Vector Multiply Instructions.
   def int_ppc_altivec_vmulesb : GCCBuiltin<"__builtin_altivec_vmulesb">,
@@ -632,6 +642,7 @@
   def int_ppc_altivec_vmulesw : GCCBuiltin<"__builtin_altivec_vmulesw">,
           Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
                     [IntrNoMem]>;
+  def int_ppc_altivec_vmulesd : PowerPC_Vec_QDD_Intrinsic<"vmulesd">;
   def int_ppc_altivec_vmuleub : GCCBuiltin<"__builtin_altivec_vmuleub">,
           Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
                     [IntrNoMem]>;
@@ -641,6 +652,7 @@
   def int_ppc_altivec_vmuleuw : GCCBuiltin<"__builtin_altivec_vmuleuw">,
           Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
                     [IntrNoMem]>;
+  def int_ppc_altivec_vmuleud : PowerPC_Vec_QDD_Intrinsic<"vmuleud">;
   def int_ppc_altivec_vmulosb : GCCBuiltin<"__builtin_altivec_vmulosb">,
           Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
                     [IntrNoMem]>;
@@ -651,6 +663,7 @@
   def int_ppc_altivec_vmulosw : GCCBuiltin<"__builtin_altivec_vmulosw">,
          Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
                    [IntrNoMem]>;
+  def int_ppc_altivec_vmulosd : PowerPC_Vec_QDD_Intrinsic<"vmulosd">;
   def int_ppc_altivec_vmuloub : GCCBuiltin<"__builtin_altivec_vmuloub">,
           Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
                     [IntrNoMem]>;
@@ -660,6 +673,7 @@
   def int_ppc_altivec_vmulouw : GCCBuiltin<"__builtin_altivec_vmulouw">,
           Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
                     [IntrNoMem]>;
+  def int_ppc_altivec_vmuloud : PowerPC_Vec_QDD_Intrinsic<"vmuloud">;
 
   // Vector Sum Instructions.
  def int_ppc_altivec_vsumsws : GCCBuiltin<"__builtin_altivec_vsumsws">,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -993,16 +993,26 @@
 }
   def VMULESD : VXForm_1<968, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                         "vmulesd $vD, $vA, $vB", IIC_VecGeneral, []>;
+                         "vmulesd $vD, $vA, $vB", IIC_VecGeneral,
+                         [(set v1i128:$vD, (int_ppc_altivec_vmulesd v2i64:$vA,
+                                                                    v2i64:$vB))]>;
   def VMULEUD : VXForm_1<712, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                         "vmuleud $vD, $vA, $vB", IIC_VecGeneral, []>;
+                         "vmuleud $vD, $vA, $vB", IIC_VecGeneral,
+                         [(set v1i128:$vD, (int_ppc_altivec_vmuleud v2i64:$vA,
+                                                                    v2i64:$vB))]>;
   def VMULOSD : VXForm_1<456, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                         "vmulosd $vD, $vA, $vB", IIC_VecGeneral, []>;
+                         "vmulosd $vD, $vA, $vB", IIC_VecGeneral,
+                         [(set v1i128:$vD, (int_ppc_altivec_vmulosd v2i64:$vA,
+                                                                    v2i64:$vB))]>;
   def VMULOUD : VXForm_1<200, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                         "vmuloud $vD, $vA, $vB", IIC_VecGeneral, []>;
-  def VMSUMCUD : VAForm_1a<23, (outs vrrc:$vD),
-                           (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
-                           "vmsumcud $vD, $vA, $vB, $vC", IIC_VecGeneral, []>;
+                         "vmuloud $vD, $vA, $vB", IIC_VecGeneral,
+                         [(set v1i128:$vD, (int_ppc_altivec_vmuloud v2i64:$vA,
+                                                                    v2i64:$vB))]>;
+  def VMSUMCUD : VAForm_1a<23, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+                           "vmsumcud $vD, $vA, $vB, $vC", IIC_VecGeneral,
+                           [(set v1i128:$vD,
+                                 (int_ppc_altivec_vmsumcud v2i64:$vA, v2i64:$vB,
+                                                           v1i128:$vC))]>;
   def VDIVSQ : VXForm_1<267, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                         "vdivsq $vD, $vA, $vB", IIC_VecGeneral, []>;
   def VDIVUQ : VXForm_1<11, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                         "vdivuq $vD, $vA, $vB", IIC_VecGeneral, []>;
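The ISel patterns above lower each intrinsic straight to its VX-/VA-form instruction, so no custom lowering is needed. For intuition, here is a scalar model of the unsigned even/odd doubleword multiplies in the ISA's big-endian element numbering (an illustrative sketch, not code from this patch; the _model names are hypothetical):

#include <stdint.h>

typedef unsigned __int128 u128;

/* "Even" multiplies elements 0, "odd" multiplies elements 1, counted in
   big-endian element order. */
static u128 vmuleud_model(const uint64_t a[2], const uint64_t b[2]) {
  return (u128)a[0] * (u128)b[0];
}

static u128 vmuloud_model(const uint64_t a[2], const uint64_t b[2]) {
  return (u128)a[1] * (u128)b[1];
}

On little-endian targets the element numbering is reversed, which is why the altivec.h overloads earlier in the patch dispatch vec_mule to the odd instruction (and vec_mulo to the even one) under __LITTLE_ENDIAN__.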
diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+; This test case aims to test the vector multiply instructions on Power10.
+
+declare <1 x i128> @llvm.ppc.altivec.vmuleud(<2 x i64>, <2 x i64>) nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vmuloud(<2 x i64>, <2 x i64>) nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vmulesd(<2 x i64>, <2 x i64>) nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vmulosd(<2 x i64>, <2 x i64>) nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vmsumcud(<2 x i64>, <2 x i64>, <1 x i128>) nounwind readnone
+
+define <1 x i128> @test_vmuleud(<2 x i64> %x, <2 x i64> %y) nounwind readnone {
+; CHECK-LABEL: test_vmuleud:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmuleud v2, v2, v3
+; CHECK-NEXT:    blr
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vmuleud(<2 x i64> %x, <2 x i64> %y)
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vmuloud(<2 x i64> %x, <2 x i64> %y) nounwind readnone {
+; CHECK-LABEL: test_vmuloud:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmuloud v2, v2, v3
+; CHECK-NEXT:    blr
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vmuloud(<2 x i64> %x, <2 x i64> %y)
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vmulesd(<2 x i64> %x, <2 x i64> %y) nounwind readnone {
+; CHECK-LABEL: test_vmulesd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmulesd v2, v2, v3
+; CHECK-NEXT:    blr
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vmulesd(<2 x i64> %x, <2 x i64> %y)
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vmulosd(<2 x i64> %x, <2 x i64> %y) nounwind readnone {
+; CHECK-LABEL: test_vmulosd:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmulosd v2, v2, v3
+; CHECK-NEXT:    blr
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vmulosd(<2 x i64> %x, <2 x i64> %y)
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vmsumcud(<2 x i64> %x, <2 x i64> %y, <1 x i128> %z) nounwind readnone {
+; CHECK-LABEL: test_vmsumcud:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmsumcud v2, v2, v3, v4
+; CHECK-NEXT:    blr
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vmsumcud(<2 x i64> %x, <2 x i64> %y, <1 x i128> %z)
+  ret <1 x i128> %tmp
+}
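Finally, a scalar model of vmsumcud's semantics, an assumption based on the ISA 3.1 description of Vector Multiply-Sum & write Carry-out Unsigned Doubleword (the patch itself only wires up the intrinsic): the instruction produces the carry-out of the unsigned 128-bit sum a[0]*b[0] + a[1]*b[1] + c, so the result is at most 2.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Illustrative sketch: carry-out of the 128-bit multiply-sum. The name
   vmsumcud_model is hypothetical, not part of the patch. */
static u128 vmsumcud_model(const uint64_t a[2], const uint64_t b[2], u128 c) {
  u128 p0 = (u128)a[0] * (u128)b[0];
  u128 p1 = (u128)a[1] * (u128)b[1];
  u128 s = p0 + p1;
  u128 carry = (u128)(s < p0); /* carry from the first 128-bit addition */
  s += c;
  carry += (u128)(s < c);      /* carry from the second 128-bit addition */
  return carry;                /* 0, 1, or 2 */
}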