diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h
--- a/clang/lib/Headers/altivec.h
+++ b/clang/lib/Headers/altivec.h
@@ -16509,6 +16509,54 @@
 #define vec_xl_be vec_xl
 #endif
 
+#if defined(__POWER10_VECTOR__) && defined(__VSX__)
+
+/* vec_xl_sext */
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_sext(signed long long __offset, signed char *__pointer) {
+  return (unaligned_vec_si128)*(__pointer + __offset);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_sext(signed long long __offset, signed short *__pointer) {
+  return (unaligned_vec_si128)*(__pointer + __offset);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_sext(signed long long __offset, signed int *__pointer) {
+  return (unaligned_vec_si128)*(__pointer + __offset);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_sext(signed long long __offset, signed long long *__pointer) {
+  return (unaligned_vec_si128)*(__pointer + __offset);
+}
+
+/* vec_xl_zext */
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_zext(signed long long __offset, unsigned char *__pointer) {
+  return (unaligned_vec_ui128)*(__pointer + __offset);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_zext(signed long long __offset, unsigned short *__pointer) {
+  return (unaligned_vec_ui128)*(__pointer + __offset);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_zext(signed long long __offset, unsigned int *__pointer) {
+  return (unaligned_vec_ui128)*(__pointer + __offset);
+}
+
+static __inline__ vector unsigned __int128 __ATTRS_o_ai
+vec_xl_zext(signed long long __offset, unsigned long long *__pointer) {
+  return (unaligned_vec_ui128)*(__pointer + __offset);
+}
+
+#endif
+
 /* vec_xst */
 
 static inline __ATTRS_o_ai void vec_xst(vector signed char __vec,
diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c
--- a/clang/test/CodeGen/builtins-ppc-p10vector.c
+++ b/clang/test/CodeGen/builtins-ppc-p10vector.c
@@ -9,8 +9,16 @@
 vector unsigned char vuca;
 vector unsigned short vusa;
 vector unsigned int vuia;
+signed char *ca;
+unsigned char *uca;
+
 vector unsigned long long vulla, vullb;
-unsigned int uia;
+signed int *ia;
+unsigned int uia, *uiap;
+signed short *sia;
+unsigned short *usia;
+signed long long *llia, llib;
+unsigned long long *ullia;
 
 vector unsigned long long test_vpdepd(void) {
   // CHECK: @llvm.ppc.altivec.vpdepd(<2 x i64>
@@ -79,3 +87,59 @@
   // CHECK-LE-NEXT: ret <16 x i8>
   return vec_clrr(vuca, uia);
 }
+
+vector signed __int128 test_vec_xl_sext_i8(void) {
+  // CHECK: load i8
+  // CHECK: sext i8
+  // CHECK: ret <1 x i128>
+  return vec_xl_sext(llib, ca);
+}
+
+vector signed __int128 test_vec_xl_sext_i16(void) {
+  // CHECK: load i16
+  // CHECK: sext i16
+  // CHECK: ret <1 x i128>
+  return vec_xl_sext(llib, sia);
+}
+
+vector signed __int128 test_vec_xl_sext_i32(void) {
+  // CHECK: load i32
+  // CHECK: sext i32
+  // CHECK: ret <1 x i128>
+  return vec_xl_sext(llib, ia);
+}
+
+vector signed __int128 test_vec_xl_sext_i64(void) {
+  // CHECK: load i64
+  // CHECK: sext i64
+  // CHECK: ret <1 x i128>
+  return vec_xl_sext(llib, llia);
+}
+
+vector unsigned __int128 test_vec_xl_zext_i8(void) {
+  // CHECK: load i8
+  // CHECK: zext i8
+  // CHECK: ret <1 x i128>
+  return vec_xl_zext(llib, uca);
+}
+
+vector unsigned __int128 test_vec_xl_zext_i16(void) {
+  // CHECK: load i16
+  // CHECK: zext i16
+  // CHECK: ret <1 x i128>
+  return vec_xl_zext(llib, usia);
+}
+
+vector unsigned __int128 test_vec_xl_zext_i32(void) {
+  // CHECK: load i32
+  // CHECK: zext i32
+  // CHECK: ret <1 x i128>
+  return vec_xl_zext(llib, uiap);
+}
+
+vector unsigned __int128 test_vec_xl_zext_i64(void) {
+  // CHECK: load i64
+  // CHECK: zext i64
+  // CHECK: ret <1 x i128>
+  return vec_xl_zext(llib, ullia);
+}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -480,6 +480,12 @@
     /// an xxswapd.
     LXVD2X,
 
+    /// LXVRZX - Load VSX Vector Rightmost and Zero Extend
+    /// This node represents v1i128 BUILD_VECTOR of a zero extending load
+    /// instruction from a byte, halfword, word, or doubleword to i128.
+    /// Allows utilization of the Load VSX Vector Rightmost Instructions.
+    LXVRZX,
+
     /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian.
     /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on
     /// the vector type to load vector in big-endian element order.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1573,6 +1573,7 @@
   case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";
   case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
   case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
+  case PPCISD::LXVRZX: return "PPCISD::LXVRZX";
   }
   return nullptr;
 }
@@ -13760,6 +13761,48 @@
   return SDValue();
 }
 
+// Look for the pattern of a load from a narrow width to i128, feeding
+// into a BUILD_VECTOR of v1i128. Replace this sequence with a PPCISD node
+// (LXVRZX). This node represents a zero extending load that will be matched
+// to the Load VSX Vector Rightmost instructions.
+static SDValue combineBVZEXTLOAD(SDNode *N, SelectionDAG &DAG) {
+  SDLoc dl(N);
+
+  // This combine is only eligible for a BUILD_VECTOR of v1i128.
+  // Other return types are not valid for the LXVRZX replacement.
+  if (N->getValueType(0) != MVT::v1i128)
+    return SDValue();
+
+  SDValue Operand = N->getOperand(0);
+  // Proceed with the transformation if the operand to the BUILD_VECTOR
+  // is a load instruction.
+  if (Operand.getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  LoadSDNode *LD = cast<LoadSDNode>(Operand);
+  EVT MemoryType = LD->getMemoryVT();
+
+  // This transformation is only valid if we are loading either a byte,
+  // halfword, word, or doubleword.
+  bool ValidLDType = MemoryType == MVT::i8 || MemoryType == MVT::i16 ||
+                     MemoryType == MVT::i32 || MemoryType == MVT::i64;
+
+  // Ensure that the load from the narrow width is being zero extended to i128.
+  if ((!ValidLDType) || (LD->getValueType(0) != MVT::i128) ||
+      (LD->getExtensionType() != ISD::ZEXTLOAD))
+    return SDValue();
+
+  // The width of the narrow type becomes an operand of the LXVRZX node
+  // we are creating in order to pattern match to the appropriate instruction
+  // in the backend.
+  SDValue Width = DAG.getIntPtrConstant(MemoryType.getScalarSizeInBits(), dl);
+  SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr(), Width};
+
+  return DAG.getMemIntrinsicNode(PPCISD::LXVRZX, dl,
+                                 DAG.getVTList(MVT::v1i128, MVT::Other),
+                                 LoadOps, MemoryType, LD->getMemOperand());
+}
+
 SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   assert(N->getOpcode() == ISD::BUILD_VECTOR &&
@@ -13797,6 +13840,14 @@
       return Reduced;
   }
 
+  // On Power10, the Load VSX Vector Rightmost instructions can be utilized
+  // if this is a BUILD_VECTOR of v1i128, and if the operand to the BUILD_VECTOR
+  // is a load from a valid narrow width to i128.
+  if (Subtarget.isISA3_1()) {
+    SDValue BVOfZLoad = combineBVZEXTLOAD(N, DAG);
+    if (BVOfZLoad)
+      return BVOfZLoad;
+  }
   if (N->getValueType(0) != MVT::v2f64)
     return SDValue();
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -209,6 +209,21 @@
 def PrefixInstrs : Predicate<"PPCSubTarget->hasPrefixInstrs()">;
 def IsISA3_1 : Predicate<"PPCSubTarget->isISA3_1()">;
 
+let mayLoad = 1, mayStore = 0, Predicates = [IsISA3_1] in {
+  // The XFormMemOp flag is set on the instruction format.
+  def LXVRBX : X_XT6_RA5_RB5<31, 13, "lxvrbx", vsrc, []>;
+  def LXVRHX : X_XT6_RA5_RB5<31, 45, "lxvrhx", vsrc, []>;
+  def LXVRWX : X_XT6_RA5_RB5<31, 77, "lxvrwx", vsrc, []>;
+  def LXVRDX : X_XT6_RA5_RB5<31, 109, "lxvrdx", vsrc, []>;
+}
+
+def SDT_PPCLXVRZX : SDTypeProfile<1, 2, [
+  SDTCisVT<0, v1i128>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
+
+def PPClxvrzx : SDNode<"PPCISD::LXVRZX", SDT_PPCLXVRZX,
+                       [SDNPHasChain, SDNPMayLoad]>;
+
 let Predicates = [PrefixInstrs] in {
   let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
     defm PADDI8 :
@@ -512,7 +527,7 @@
             (PPCmatpcreladdr pcreladdr:$dst), 8),
             (PSTXSDpc (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), $dst, 0)>;
 
-  // If the PPCmatpcreladdr node is not caught by any other pattern it should be 
+  // If the PPCmatpcreladdr node is not caught by any other pattern it should be
   // caught here and turned into a paddi instruction to materialize the address.
   def : Pat<(PPCmatpcreladdr pcreladdr:$addr), (PADDI8pc 0, $addr)>;
 }
@@ -564,4 +579,15 @@
             (v4i32 (COPY_TO_REGCLASS (XXGENPCVWM $VRB, imm:$IMM), VRRC))>;
   def : Pat<(v2i64 (int_ppc_vsx_xxgenpcvdm v2i64:$VRB, imm:$IMM)),
             (v2i64 (COPY_TO_REGCLASS (XXGENPCVDM $VRB, imm:$IMM), VRRC))>;
+
+  // Utilize the appropriate Load VSX Vector Rightmost instruction depending
+  // on the width of PPClxvrzx.
+  def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 8)),
+             (v1i128 (COPY_TO_REGCLASS (LXVRBX xoaddr:$src), VRRC))>;
+  def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 16)),
+             (v1i128 (COPY_TO_REGCLASS (LXVRHX xoaddr:$src), VRRC))>;
+  def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 32)),
+             (v1i128 (COPY_TO_REGCLASS (LXVRWX xoaddr:$src), VRRC))>;
+  def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 64)),
+             (v1i128 (COPY_TO_REGCLASS (LXVRDX xoaddr:$src), VRRC))>;
 }
diff --git a/llvm/test/CodeGen/PowerPC/p10-vsx-builtins.ll b/llvm/test/CodeGen/PowerPC/p10-vsx-builtins.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/p10-vsx-builtins.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; These test cases test that zero extending loads utilize the Load VSX Vector Rightmost
+
+; (lxvr[b|h|w|d]x) instructions in Power10.
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local <1 x i128> @vec_xl_zext(i64 %__offset, i8* nocapture readonly %__pointer) local_unnamed_addr #0 {
+; CHECK-LABEL: vec_xl_zext:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvrbx v2, r4, r3
+; CHECK-NEXT:    blr
+entry:
+  %add.ptr = getelementptr inbounds i8, i8* %__pointer, i64 %__offset
+  %0 = load i8, i8* %add.ptr, align 1
+  %conv = zext i8 %0 to i128
+  %splat.splatinsert = insertelement <1 x i128> undef, i128 %conv, i32 0
+  ret <1 x i128> %splat.splatinsert
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local <1 x i128> @vec_xl_zext_short(i64 %__offset, i16* nocapture readonly %__pointer) local_unnamed_addr #0 {
+; CHECK-LABEL: vec_xl_zext_short:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi r3, r3, 1
+; CHECK-NEXT:    lxvrhx v2, r4, r3
+; CHECK-NEXT:    blr
+entry:
+  %add.ptr = getelementptr inbounds i16, i16* %__pointer, i64 %__offset
+  %0 = load i16, i16* %add.ptr, align 2
+  %conv = zext i16 %0 to i128
+  %splat.splatinsert = insertelement <1 x i128> undef, i128 %conv, i32 0
+  ret <1 x i128> %splat.splatinsert
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local <1 x i128> @vec_xl_zext_word(i64 %__offset, i32* nocapture readonly %__pointer) local_unnamed_addr #0 {
+; CHECK-LABEL: vec_xl_zext_word:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi r3, r3, 2
+; CHECK-NEXT:    lxvrwx v2, r4, r3
+; CHECK-NEXT:    blr
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %__pointer, i64 %__offset
+  %0 = load i32, i32* %add.ptr, align 4
+  %conv = zext i32 %0 to i128
+  %splat.splatinsert = insertelement <1 x i128> undef, i128 %conv, i32 0
+  ret <1 x i128> %splat.splatinsert
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local <1 x i128> @vec_xl_zext_dw(i64 %__offset, i64* nocapture readonly %__pointer) local_unnamed_addr #0 {
+; CHECK-LABEL: vec_xl_zext_dw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sldi r3, r3, 3
+; CHECK-NEXT:    lxvrdx v2, r4, r3
+; CHECK-NEXT:    blr
+entry:
+  %add.ptr = getelementptr inbounds i64, i64* %__pointer, i64 %__offset
+  %0 = load i64, i64* %add.ptr, align 8
+  %conv = zext i64 %0 to i128
+  %splat.splatinsert = insertelement <1 x i128> undef, i128 %conv, i32 0
+  ret <1 x i128> %splat.splatinsert
+}
+
diff --git a/llvm/test/MC/Disassembler/PowerPC/p10insts.txt b/llvm/test/MC/Disassembler/PowerPC/p10insts.txt
--- a/llvm/test/MC/Disassembler/PowerPC/p10insts.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/p10insts.txt
@@ -30,3 +30,15 @@
 
 # CHECK: vclrrb 1, 4, 3
 0x10 0x24 0x19 0xcd
+
+# CHECK: lxvrbx 32, 1, 2
+0x7c 0x01 0x10 0x1b
+
+# CHECK: lxvrhx 33, 1, 2
+0x7c 0x21 0x10 0x5b
+
+# CHECK: lxvrdx 34, 1, 2
+0x7c 0x41 0x10 0xdb
+
+# CHECK: lxvrwx 35, 1, 2
+0x7c 0x61 0x10 0x9b
diff --git a/llvm/test/MC/PowerPC/p10.s b/llvm/test/MC/PowerPC/p10.s
--- a/llvm/test/MC/PowerPC/p10.s
+++ b/llvm/test/MC/PowerPC/p10.s
@@ -33,3 +33,15 @@
 # CHECK-BE: vclrrb 1, 4, 3 # encoding: [0x10,0x24,0x19,0xcd]
 # CHECK-LE: vclrrb 1, 4, 3 # encoding: [0xcd,0x19,0x24,0x10]
             vclrrb 1, 4, 3
+# CHECK-BE: lxvrbx 32, 1, 2 # encoding: [0x7c,0x01,0x10,0x1b]
+# CHECK-LE: lxvrbx 32, 1, 2 # encoding: [0x1b,0x10,0x01,0x7c]
+            lxvrbx 32, 1, 2
+# CHECK-BE: lxvrhx 33, 1, 2 # encoding: [0x7c,0x21,0x10,0x5b]
+# CHECK-LE: lxvrhx 33, 1, 2 # encoding: [0x5b,0x10,0x21,0x7c]
+            lxvrhx 33, 1, 2
+# CHECK-BE: lxvrdx 34, 1, 2 # encoding: [0x7c,0x41,0x10,0xdb]
+# CHECK-LE: lxvrdx 34, 1, 2 # encoding: [0xdb,0x10,0x41,0x7c]
+            lxvrdx 34, 1, 2
+# CHECK-BE: lxvrwx 35, 1, 2 # encoding: [0x7c,0x61,0x10,0x9b]
+# CHECK-LE: lxvrwx 35, 1, 2 # encoding: [0x9b,0x10,0x61,0x7c]
+            lxvrwx 35, 1, 2