diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -422,6 +422,10 @@
 BUILTIN(__builtin_altivec_vextddvlx, "V2ULLiV2ULLiV2ULLiUi", "")
 BUILTIN(__builtin_altivec_vextddvrx, "V2ULLiV2ULLiV2ULLiUi", "")
 
+// P10 Vector rotate built-ins.
+BUILTIN(__builtin_altivec_vrlqmi, "V1ULLLiV1ULLLiV1ULLLiV1ULLLi", "")
+BUILTIN(__builtin_altivec_vrlqnm, "V1ULLLiV1ULLLiV1ULLLi", "")
+
 // VSX built-ins.
 
 BUILTIN(__builtin_vsx_lxvd2x, "V2divC*", "")
diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h
--- a/clang/lib/Headers/altivec.h
+++ b/clang/lib/Headers/altivec.h
@@ -7927,6 +7927,24 @@
 }
 #endif
 
+#ifdef __POWER10_VECTOR__
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_rl(vector signed __int128 __a, vector unsigned __int128 __b) {
+  // Rotate __a left by __b; go through the unsigned type so the right
+  // shift stays logical.
+  return (vector signed __int128)(((vector unsigned __int128)__a << __b) |
+                                  ((vector unsigned __int128)__a >>
+                                   ((__CHAR_BIT__ *
+                                     sizeof(vector unsigned __int128)) -
+                                    __b)));
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_rl(vector unsigned __int128 __a, vector unsigned __int128 __b) {
+  return (__a << __b) |
+         (__a >> ((__CHAR_BIT__ * sizeof(vector unsigned __int128)) - __b));
+}
+#endif
+
 /* vec_rlmi */
 #ifdef __POWER9_VECTOR__
 static __inline__ vector unsigned int __ATTRS_o_ai
@@ -7940,8 +7958,28 @@
                                   vector unsigned long long __c) {
   return __builtin_altivec_vrldmi(__a, __c, __b);
 }
+#endif
+
+#ifdef __POWER10_VECTOR__
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_rlmi(vector unsigned __int128 __a, vector unsigned __int128 __b,
+         vector unsigned __int128 __c) {
+  return __builtin_altivec_vrlqmi(__a, __c, __b);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_rlmi(vector signed __int128 __a, vector signed __int128 __b,
+         vector signed __int128 __c) {
+  // The builtin is declared on unsigned vectors, so cast in and out.
+  return (vector signed __int128)__builtin_altivec_vrlqmi(
+      (vector unsigned __int128)__a, (vector unsigned __int128)__c,
+      (vector unsigned __int128)__b);
+}
+#endif
 
 /* vec_rlnm */
+#ifdef __POWER9_VECTOR__
 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_rlnm(vector unsigned int __a, vector unsigned int __b,
          vector unsigned int __c) {
@@ -7957,6 +7995,44 @@
 }
 #endif
 
+#ifdef __POWER10_VECTOR__
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_rlnm(vector unsigned __int128 __a, vector unsigned __int128 __b,
+         vector unsigned __int128 __c) {
+  // Merge __b (rotate amount) and __c (mask) into vrlqnm's control vector.
+  vector unsigned char TmpB = (vector unsigned char)__b;
+  vector unsigned char TmpC = (vector unsigned char)__c;
+  vector unsigned char MaskAndShift =
+#ifdef __LITTLE_ENDIAN__
+      __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16,
+                              0, 1, -1, -1, -1, -1, -1);
+#else
+      __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 31, 30, 15, -1,
+                              -1, -1, -1, -1, -1, -1, -1);
+#endif
+  return __builtin_altivec_vrlqnm(__a,
+                                  (vector unsigned __int128)MaskAndShift);
+}
+
+static __inline__ vector signed __int128 __ATTRS_o_ai
+vec_rlnm(vector signed __int128 __a, vector signed __int128 __b,
+         vector signed __int128 __c) {
+  // Merge __b (rotate amount) and __c (mask) into vrlqnm's control vector.
+  vector unsigned char TmpB = (vector unsigned char)__b;
+  vector unsigned char TmpC = (vector unsigned char)__c;
+  vector unsigned char MaskAndShift =
+#ifdef __LITTLE_ENDIAN__
+      __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, -1, -1, -1, 16,
+                              0, 1, -1, -1, -1, -1, -1);
+#else
+      __builtin_shufflevector(TmpB, TmpC, -1, -1, -1, -1, -1, 31, 30, 15, -1,
+                              -1, -1, -1, -1, -1, -1, -1);
+#endif
+  return (vector signed __int128)__builtin_altivec_vrlqnm(
+      (vector unsigned __int128)__a, (vector unsigned __int128)MaskAndShift);
+}
+#endif
+
 /* vec_vrlb */
 
 static __inline__ vector signed char __ATTRS_o_ai
diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c
--- a/clang/test/CodeGen/builtins-ppc-p10vector.c
+++ b/clang/test/CodeGen/builtins-ppc-p10vector.c
@@ -17,7 +17,7 @@
 vector unsigned int vuia, vuib, vuic;
 vector signed long long vslla, vsllb;
 vector unsigned long long vulla, vullb, vullc;
-vector signed __int128 vsi128a, vsi128b;
+vector signed __int128 vsi128a, vsi128b, vsi128c;
 vector unsigned __int128 vui128a, vui128b, vui128c;
 vector float vfa, vfb;
 vector double vda, vdb;
@@ -1880,3 +1880,53 @@
 // CHECK-NEXT: ret i32
   return vec_all_ge(vui128a, vui128b);
 }
+
+vector signed __int128 test_vec_rl_s128(void) {
+  // CHECK-LABEL: @test_vec_rl_s128(
+  // CHECK: sub <1 x i128>
+  // CHECK-NEXT: lshr <1 x i128>
+  // CHECK-NEXT: or <1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rl(vsi128a, vsi128b);
+}
+
+vector unsigned __int128 test_vec_rl_u128(void) {
+  // CHECK-LABEL: @test_vec_rl_u128(
+  // CHECK: sub <1 x i128>
+  // CHECK: lshr <1 x i128>
+  // CHECK: or <1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rl(vui128a, vui128b);
+}
+
+vector signed __int128 test_vec_rlnm_s128(void) {
+  // CHECK-LABEL: @test_vec_rlnm_s128(
+  // CHECK-LE: %shuffle.i = shufflevector <16 x i8> %7, <16 x i8> %8, <16 x i32>
+  // CHECK-BE: %shuffle.i = shufflevector <16 x i8> %7, <16 x i8> %8, <16 x i32>
+  // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rlnm(vsi128a, vsi128b, vsi128c);
+}
+
+vector unsigned __int128 test_vec_rlnm_u128(void) {
+  // CHECK-LABEL: @test_vec_rlnm_u128(
+  // CHECK-LE: %shuffle.i = shufflevector <16 x i8> %7, <16 x i8> %8, <16 x i32>
+  // CHECK-BE: %shuffle.i = shufflevector <16 x i8> %7, <16 x i8> %8, <16 x i32>
+  // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rlnm(vui128a, vui128b, vui128c);
+}
+
+vector signed __int128 test_vec_rlmi_s128(void) {
+  // CHECK-LABEL: @test_vec_rlmi_s128(
+  // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rlmi(vsi128a, vsi128b, vsi128c);
+}
+
+vector unsigned __int128 test_vec_rlmi_u128(void) {
+  // CHECK-LABEL: @test_vec_rlmi_u128(
+  // CHECK: call <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128>
+  // CHECK-NEXT: ret <1 x i128>
+  return vec_rlmi(vui128a, vui128b, vui128c);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1057,6 +1057,16 @@
                             [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
                             [IntrNoMem]>;
 
+// P10 Vector rotate intrinsics.
+def int_ppc_altivec_vrlqnm :
+    PowerPC_Vec_Intrinsic<"vrlqnm", [llvm_v1i128_ty],
+                          [llvm_v1i128_ty, llvm_v1i128_ty],
+                          [IntrNoMem]>;
+def int_ppc_altivec_vrlqmi :
+    PowerPC_Vec_Intrinsic<"vrlqmi", [llvm_v1i128_ty],
+                          [llvm_v1i128_ty, llvm_v1i128_ty, llvm_v1i128_ty],
+                          [IntrNoMem]>;
+
 // Vector Divide Extended Intrinsics.
 def int_ppc_altivec_vdivesw : PowerPC_Vec_WWW_Intrinsic<"vdivesw">;
 def int_ppc_altivec_vdiveuw : PowerPC_Vec_WWW_Intrinsic<"vdiveuw">;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -892,6 +892,7 @@
       setOperationAction(ISD::SREM, MVT::v1i128, Legal);
       setOperationAction(ISD::UDIV, MVT::v1i128, Legal);
       setOperationAction(ISD::SDIV, MVT::v1i128, Legal);
+      setOperationAction(ISD::ROTL, MVT::v1i128, Legal);
     }
 
     setOperationAction(ISD::MUL, MVT::v8i16, Legal);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -2111,10 +2111,16 @@
                                "vcmpuq $BF, $vA, $vB", IIC_VecGeneral, []>;
   def VCMPSQ : VXForm_BF3_VAB5<321, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB),
                                "vcmpsq $BF, $vA, $vB", IIC_VecGeneral, []>;
-  def VRLQNM : VX1_VT5_VA5_VB5<325, "vrlqnm", []>;
+  def VRLQNM : VX1_VT5_VA5_VB5<325, "vrlqnm",
+                               [(set v1i128:$vD,
+                                     (int_ppc_altivec_vrlqnm v1i128:$vA,
+                                                             v1i128:$vB))]>;
   def VRLQMI : VXForm_1<69, (outs vrrc:$vD),
                         (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi),
-                        "vrlqmi $vD, $vA, $vB", IIC_VecFP, []>,
+                        "vrlqmi $vD, $vA, $vB", IIC_VecFP,
+                        [(set v1i128:$vD,
+                              (int_ppc_altivec_vrlqmi v1i128:$vA, v1i128:$vB,
+                                                      v1i128:$vDi))]>,
                         RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
   def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>;
   def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>;
@@ -2175,6 +2181,9 @@
             (v1i128 (COPY_TO_REGCLASS (LXVRWX xoaddr:$src), VRRC))>;
   def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 64)),
             (v1i128 (COPY_TO_REGCLASS (LXVRDX xoaddr:$src), VRRC))>;
+
+  def : Pat<(v1i128 (rotl v1i128:$vA, v1i128:$vB)),
+            (v1i128 (VRLQ v1i128:$vA, v1i128:$vB))>;
 }
 
 let Predicates = [IsISA3_1, HasVSX] in {
diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-rotate.ll b/llvm/test/CodeGen/PowerPC/p10-vector-rotate.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/p10-vector-rotate.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s -check-prefixes=CHECK-LE,CHECK
+
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s -check-prefixes=CHECK-BE,CHECK
+
+; This test exercises the Power10 vector quadword rotate instructions:
+; vrlq via the ISD::ROTL pattern, and the vrlqmi/vrlqnm intrinsics.
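+;
+; In the (rotl vA, vB) ISel pattern, vrlq's vA operand carries the value
+; being rotated and vB supplies the rotate amount.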
+
+define <1 x i128> @test_vrlq(<1 x i128> %x, <1 x i128> %y) {
+; CHECK-LABEL: test_vrlq:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrlq v2, v3, v2
+; CHECK-NEXT:    blr
+  %shl.i = shl <1 x i128> %y, %x
+  %sub.i = sub <1 x i128> <i128 128>, %x
+  %lshr.i = lshr <1 x i128> %y, %sub.i
+  %tmp = or <1 x i128> %shl.i, %lshr.i
+  ret <1 x i128> %tmp
+}
+
+; The 128-bit constants rotated below are representative values: 16 is a
+; multiple of 8, 23 is not.
+define <1 x i128> @test_vrlq_cost_mult8(<1 x i128> %x) {
+; CHECK-LABEL: test_vrlq_cost_mult8:
+; CHECK:       # %bb.0:
+; CHECK:         vrlq v2, v3, v2
+; CHECK-NEXT:    blr
+  %shl.i = shl <1 x i128> <i128 16>, %x
+  %sub.i = sub <1 x i128> <i128 128>, %x
+  %lshr.i = lshr <1 x i128> <i128 16>, %sub.i
+  %tmp = or <1 x i128> %shl.i, %lshr.i
+  ret <1 x i128> %tmp
+}
+
+define <1 x i128> @test_vrlq_cost_non_mult8(<1 x i128> %x) {
+; CHECK-LABEL: test_vrlq_cost_non_mult8:
+; CHECK:       # %bb.0:
+; CHECK:         vrlq v2, v3, v2
+; CHECK-NEXT:    blr
+  %shl.i = shl <1 x i128> <i128 23>, %x
+  %sub.i = sub <1 x i128> <i128 128>, %x
+  %lshr.i = lshr <1 x i128> <i128 23>, %sub.i
+  %tmp = or <1 x i128> %shl.i, %lshr.i
+  ret <1 x i128> %tmp
+}
+
+; Function Attrs: nounwind readnone
+define <1 x i128> @test_vrlqmi(<1 x i128> %a, <1 x i128> %b, <1 x i128> %c) {
+; CHECK-LABEL: test_vrlqmi:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vrlqmi v3, v2, v4
+; CHECK-NEXT:    vmr v2, v3
+; CHECK-NEXT:    blr
entry:
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128> %a, <1 x i128> %c, <1 x i128> %b)
+  ret <1 x i128> %tmp
+}
+
+; Function Attrs: nounwind readnone
+define <1 x i128> @test_vrlqnm(<1 x i128> %a, <1 x i128> %b, <1 x i128> %c) {
+; CHECK-LABEL: test_vrlqnm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-BE:       lxvx v5
+; CHECK-BE-NEXT:  vperm v3, v3, v4, v5
+; CHECK-LE-NEXT:  plxv v5
+; CHECK-LE-NEXT:  vperm v3, v4, v3, v5
+; CHECK-NEXT:     vrlqnm v2, v2, v3
+; CHECK-NEXT:     blr
entry:
+  %0 = bitcast <1 x i128> %b to <16 x i8>
+  %1 = bitcast <1 x i128> %c to <16 x i8>
+  %shuffle.i = shufflevector <16 x i8> %0, <16 x i8> %1, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %d = bitcast <16 x i8> %shuffle.i to <1 x i128>
+  %tmp = tail call <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128> %a, <1 x i128> %d)
+  ret <1 x i128> %tmp
+}
+
+; Function Attrs: nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vrlqmi(<1 x i128>, <1 x i128>, <1 x i128>)
+
+; Function Attrs: nounwind readnone
+declare <1 x i128> @llvm.ppc.altivec.vrlqnm(<1 x i128>, <1 x i128>)
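
Usage sketch (not part of the patch): the standalone program below shows the
new 128-bit vec_rl overload from altivec.h. The file name, constants, and
build line are illustrative assumptions; it needs a Power10-enabled clang so
that __POWER10_VECTOR__ is defined.

  // rotate_demo.c -- hypothetical example, not part of this patch.
  // Assumed build: clang -mcpu=pwr10 rotate_demo.c -o rotate_demo
  #include <altivec.h>
  #include <stdio.h>

  int main(void) {
    // A v1i128 vector holds a single 128-bit element.
    vector unsigned __int128 data = {
        ((unsigned __int128)0x0123456789abcdefULL << 64) |
        0xfedcba9876543210ULL};
    vector unsigned __int128 amount = {8}; // rotate left by 8 bits

    // vec_rl rotates the 128-bit element of data left by amount; on a
    // Power10 target this lowers to a single vrlq (see the llc test above).
    vector unsigned __int128 rotated = vec_rl(data, amount);

    // Print the result as two 64-bit halves; the expected output is the
    // input rotated left one byte: 23456789abcdeffedcba987654321001.
    unsigned __int128 r = rotated[0];
    printf("%016llx%016llx\n", (unsigned long long)(r >> 64),
           (unsigned long long)r);
    return 0;
  }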