diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -100,6 +100,11 @@
 BUILTIN(__builtin_altivec_vmulosh, "V4SiV8SsV8Ss", "")
 BUILTIN(__builtin_altivec_vmulouw, "V2ULLiV4UiV4Ui", "")
 BUILTIN(__builtin_altivec_vmulosw, "V2SLLiV4SiV4Si", "")
+BUILTIN(__builtin_altivec_vmuleud, "V1ULLLiV2ULLiV2ULLi", "")
+BUILTIN(__builtin_altivec_vmulesd, "V1SLLLiV2SLLiV2SLLi", "")
+BUILTIN(__builtin_altivec_vmuloud, "V1ULLLiV2ULLiV2ULLi", "")
+BUILTIN(__builtin_altivec_vmulosd, "V1SLLLiV2SLLiV2SLLi", "")
+BUILTIN(__builtin_altivec_vmsumcud, "V1ULLLiV2ULLiV2ULLiV1ULLLi", "")
 BUILTIN(__builtin_altivec_vnmsubfp, "V4fV4fV4fV4f", "")
diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h
--- a/clang/lib/Headers/altivec.h
+++ b/clang/lib/Headers/altivec.h
@@ -5487,6 +5487,16 @@
   return __builtin_altivec_vmsumuhm(__a, __b, __c);
 }
 
+/* vec_msumc */
+
+#ifdef __POWER10_VECTOR__
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_msumc(vector unsigned long long __a, vector unsigned long long __b,
+          vector unsigned __int128 __c) {
+  return __builtin_altivec_vmsumcud(__a, __b, __c);
+}
+#endif
+
 /* vec_vmsummbm */
 
 static __inline__ vector int __attribute__((__always_inline__))
@@ -5713,6 +5723,26 @@
 }
 #endif
 
+#ifdef __POWER10_VECTOR__
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_mule(vector signed long long __a, vector signed long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulosd(__a, __b);
+#else
+  return __builtin_altivec_vmulesd(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_mule(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuloud(__a, __b);
+#else
+  return __builtin_altivec_vmuleud(__a, __b);
+#endif
+}
+#endif
+
 /* vec_vmulesb */
 
 static __inline__ vector short __attribute__((__always_inline__))
@@ -5839,6 +5869,26 @@
 }
 #endif
 
+#ifdef __POWER10_VECTOR__
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_mulo(vector signed long long __a, vector signed long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmulesd(__a, __b);
+#else
+  return __builtin_altivec_vmulosd(__a, __b);
+#endif
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_mulo(vector unsigned long long __a, vector unsigned long long __b) {
+#ifdef __LITTLE_ENDIAN__
+  return __builtin_altivec_vmuleud(__a, __b);
+#else
+  return __builtin_altivec_vmuloud(__a, __b);
+#endif
+}
+#endif
+
 /* vec_vmulosb */
 
 static __inline__ vector short __attribute__((__always_inline__))
diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c
--- a/clang/test/CodeGen/builtins-ppc-p10vector.c
+++ b/clang/test/CodeGen/builtins-ppc-p10vector.c
@@ -928,6 +928,44 @@
   return vec_test_lsbb_all_zeros(vuca);
 }
 
+vector unsigned __int128 test_vec_mule_u128(void) {
+  // CHECK-BE: @llvm.ppc.altivec.vmuleud(<2 x i64>
+  // CHECK-BE-NEXT: ret <1 x i128>
+  // CHECK-LE: @llvm.ppc.altivec.vmuloud(<2 x i64>
+  // CHECK-LE-NEXT: ret <1 x i128>
+  return vec_mule(vulla, vullb);
+}
+
+vector signed __int128 test_vec_mule_s128(void) {
+  // CHECK-BE: @llvm.ppc.altivec.vmulesd(<2 x i64>
+  // CHECK-BE-NEXT: ret <1 x i128>
+  // CHECK-LE: @llvm.ppc.altivec.vmulosd(<2 x i64>
+  // CHECK-LE-NEXT: ret <1 x i128>
+  return vec_mule(vslla, vsllb);
+}
+
+vector unsigned __int128 test_vec_mulo_u128(void) {
+  // CHECK-BE: @llvm.ppc.altivec.vmuloud(<2 x i64>
+  // CHECK-BE-NEXT: ret <1 x i128>
+  // CHECK-LE: @llvm.ppc.altivec.vmuleud(<2 x i64>
+  // CHECK-LE-NEXT: ret <1 x i128>
+  return vec_mulo(vulla, vullb);
+}
+
+vector signed __int128 test_vec_mulo_s128(void) {
+  // CHECK-BE: @llvm.ppc.altivec.vmulosd(<2 x i64>
+  // CHECK-BE-NEXT: ret <1 x i128>
+  // CHECK-LE: @llvm.ppc.altivec.vmulesd(<2 x i64>
+  // CHECK-LE-NEXT: ret <1 x i128>
+  return vec_mulo(vslla, vsllb);
+}
+
+vector unsigned __int128 test_vec_msumc_u128(void) {
+  // CHECK: @llvm.ppc.altivec.vmsumcud(<2 x i64>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_msumc(vulla, vullb, vui128a);
+}
+
 vector signed __int128 test_vec_xl_sext_i8(void) {
   // CHECK: load i8
   // CHECK: sext i8
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -193,6 +193,13 @@
                           [llvm_v1i128_ty], [llvm_v1i128_ty, llvm_v1i128_ty],
                           [IntrNoMem]>;
 
+/// PowerPC_Vec_QDD_Intrinsic - A PowerPC intrinsic that takes two v2i64
+/// vectors and returns one v1i128. These intrinsics have no side effects.
+class PowerPC_Vec_QDD_Intrinsic<string GCCIntSuffix>
+  : PowerPC_Vec_Intrinsic<GCCIntSuffix,
+                          [llvm_v1i128_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+                          [IntrNoMem]>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC VSX Intrinsic Class Definitions.
 //
@@ -673,6 +680,9 @@
   def int_ppc_altivec_vmsumuhs : GCCBuiltin<"__builtin_altivec_vmsumuhs">,
             Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
                        llvm_v4i32_ty], [IntrNoMem]>;
+  def int_ppc_altivec_vmsumcud : GCCBuiltin<"__builtin_altivec_vmsumcud">,
+            Intrinsic<[llvm_v1i128_ty],
+                      [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v1i128_ty], [IntrNoMem]>;
 
   // Vector Multiply Instructions.
   def int_ppc_altivec_vmulesb : GCCBuiltin<"__builtin_altivec_vmulesb">,
             Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
                       [IntrNoMem]>;
@@ -684,6 +694,7 @@
   def int_ppc_altivec_vmulesw : GCCBuiltin<"__builtin_altivec_vmulesw">,
             Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
                       [IntrNoMem]>;
+  def int_ppc_altivec_vmulesd : PowerPC_Vec_QDD_Intrinsic<"vmulesd">;
   def int_ppc_altivec_vmuleub : GCCBuiltin<"__builtin_altivec_vmuleub">,
             Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
                       [IntrNoMem]>;
@@ -693,6 +704,7 @@
   def int_ppc_altivec_vmuleuw : GCCBuiltin<"__builtin_altivec_vmuleuw">,
             Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
                       [IntrNoMem]>;
+  def int_ppc_altivec_vmuleud : PowerPC_Vec_QDD_Intrinsic<"vmuleud">;
   def int_ppc_altivec_vmulosb : GCCBuiltin<"__builtin_altivec_vmulosb">,
             Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
                       [IntrNoMem]>;
@@ -703,6 +715,7 @@
   def int_ppc_altivec_vmulosw : GCCBuiltin<"__builtin_altivec_vmulosw">,
             Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
                       [IntrNoMem]>;
+  def int_ppc_altivec_vmulosd : PowerPC_Vec_QDD_Intrinsic<"vmulosd">;
   def int_ppc_altivec_vmuloub : GCCBuiltin<"__builtin_altivec_vmuloub">,
             Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
                       [IntrNoMem]>;
@@ -712,6 +725,7 @@
   def int_ppc_altivec_vmulouw : GCCBuiltin<"__builtin_altivec_vmulouw">,
             Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
                       [IntrNoMem]>;
+  def int_ppc_altivec_vmuloud : PowerPC_Vec_QDD_Intrinsic<"vmuloud">;
 
   // Vector Sum Instructions.
   def int_ppc_altivec_vsumsws : GCCBuiltin<"__builtin_altivec_vsumsws">,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1256,16 +1256,25 @@
   }
   def VMULESD : VXForm_1<968, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                         "vmulesd $vD, $vA, $vB", IIC_VecGeneral, []>;
+                         "vmulesd $vD, $vA, $vB", IIC_VecGeneral,
+                         [(set v1i128:$vD, (int_ppc_altivec_vmulesd v2i64:$vA,
+                                                                    v2i64:$vB))]>;
   def VMULEUD : VXForm_1<712, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                         "vmuleud $vD, $vA, $vB", IIC_VecGeneral, []>;
+                         "vmuleud $vD, $vA, $vB", IIC_VecGeneral,
+                         [(set v1i128:$vD, (int_ppc_altivec_vmuleud v2i64:$vA,
+                                                                    v2i64:$vB))]>;
   def VMULOSD : VXForm_1<456, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                         "vmulosd $vD, $vA, $vB", IIC_VecGeneral, []>;
+                         "vmulosd $vD, $vA, $vB", IIC_VecGeneral,
+                         [(set v1i128:$vD, (int_ppc_altivec_vmulosd v2i64:$vA,
+                                                                    v2i64:$vB))]>;
   def VMULOUD : VXForm_1<200, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                         "vmuloud $vD, $vA, $vB", IIC_VecGeneral, []>;
-  def VMSUMCUD : VAForm_1a<23, (outs vrrc:$vD),
-                           (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
-                           "vmsumcud $vD, $vA, $vB, $vC", IIC_VecGeneral, []>;
+                         "vmuloud $vD, $vA, $vB", IIC_VecGeneral,
+                         [(set v1i128:$vD, (int_ppc_altivec_vmuloud v2i64:$vA,
+                                                                    v2i64:$vB))]>;
+  def VMSUMCUD : VAForm_1a<23, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC),
+                           "vmsumcud $vD, $vA, $vB, $vC", IIC_VecGeneral,
+                           [(set v1i128:$vD, (int_ppc_altivec_vmsumcud
+                                                  v2i64:$vA, v2i64:$vB, v1i128:$vC))]>;
   def VDIVSQ : VXForm_1<267, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                         "vdivsq $vD, $vA, $vB", IIC_VecGeneral, []>;
   def VDIVUQ : VXForm_1<11, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                         "vdivuq $vD, $vA, $vB", IIC_VecGeneral, []>;
diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll
--- a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll
@@ -10,6 +10,7 @@
 ; This includes the low order and high order versions of vector multiply.
 ; The low order version operates on doublewords, whereas the high order version
 ; operates on signed and unsigned words and doublewords.
+; This file also includes 128-bit vector multiply instructions.
 
 define <2 x i64> @test_vmulld(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vmulld:
@@ -122,3 +123,54 @@
   %mulh = tail call <2 x i64> @llvm.ppc.altivec.vmulhud(<2 x i64> %a, <2 x i64> %b)
   ret <2 x i64> %mulh
 }
+
+declare <1 x i128> @llvm.ppc.altivec.vmuleud(<2 x i64>, <2 x i64>) nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vmuloud(<2 x i64>, <2 x i64>) nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vmulesd(<2 x i64>, <2 x i64>) nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vmulosd(<2 x i64>, <2 x i64>) nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vmsumcud(<2 x i64>, <2 x i64>, <1 x i128>) nounwind readnone
+
+define <1 x i128> @test_vmuleud(<2 x i64> %x, <2 x i64> %y) nounwind readnone {
+; CHECK-LABEL: test_vmuleud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmuleud v2, v2, v3
+; CHECK-NEXT: blr
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vmuleud(<2 x i64> %x, <2 x i64> %y)
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vmuloud(<2 x i64> %x, <2 x i64> %y) nounwind readnone {
+; CHECK-LABEL: test_vmuloud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmuloud v2, v2, v3
+; CHECK-NEXT: blr
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vmuloud(<2 x i64> %x, <2 x i64> %y)
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vmulesd(<2 x i64> %x, <2 x i64> %y) nounwind readnone {
+; CHECK-LABEL: test_vmulesd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmulesd v2, v2, v3
+; CHECK-NEXT: blr
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vmulesd(<2 x i64> %x, <2 x i64> %y)
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vmulosd(<2 x i64> %x, <2 x i64> %y) nounwind readnone {
+; CHECK-LABEL: test_vmulosd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmulosd v2, v2, v3
+; CHECK-NEXT: blr
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vmulosd(<2 x i64> %x, <2 x i64> %y)
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vmsumcud(<2 x i64> %x, <2 x i64> %y, <1 x i128> %z) nounwind readnone {
+; CHECK-LABEL: test_vmsumcud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmsumcud v2, v2, v3, v4
+; CHECK-NEXT: blr
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vmsumcud(<2 x i64> %x, <2 x i64> %y, <1 x i128> %z)
+  ret <1 x i128> %tmp
+}
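Usage sketch (not part of the patch): the altivec.h overloads added above can be driven from plain C once the compiler targets Power10. The helper names and the build line in the comment are hypothetical; the sketch assumes a Clang carrying this patch, invoked with a Power10 CPU (e.g. -mcpu=pwr10) so that __POWER10_VECTOR__ is defined and the vector __int128 overloads are visible.

/* mul128_example.c - illustrative sketch only, not part of the patch.
 * Assumed build line: clang -O2 -mcpu=pwr10 -c mul128_example.c
 */
#include <altivec.h>

/* vec_mule: 128-bit product of the even-numbered doubleword elements of a and
 * b; the header picks vmuleud or vmuloud so element numbering is consistent on
 * big- and little-endian targets. */
vector unsigned __int128 mule_u64(vector unsigned long long a,
                                  vector unsigned long long b) {
  return vec_mule(a, b);
}

/* vec_mulo: 128-bit product of the odd-numbered doubleword elements. */
vector signed __int128 mulo_s64(vector signed long long a,
                                vector signed long long b) {
  return vec_mulo(a, b);
}

/* vec_msumc: maps to vmsumcud (Vector Multiply-Sum & write Carry-out Unsigned
 * Doubleword); two doubleword vectors plus a 128-bit addend, yielding a
 * 128-bit result. */
vector unsigned __int128 msumc_u64(vector unsigned long long a,
                                   vector unsigned long long b,
                                   vector unsigned __int128 c) {
  return vec_msumc(a, b, c);
}

The even/odd swap under __LITTLE_ENDIAN__ in the header mirrors what the existing byte, halfword, and word overloads of vec_mule/vec_mulo already do, which is why caller code like this never has to test endianness itself.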