diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -4466,11 +4466,14 @@
 bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
   LoadSDNode *LDN = dyn_cast<LoadSDNode>(N);
   StoreSDNode *STN = dyn_cast<StoreSDNode>(N);
+  MemIntrinsicSDNode *MIN = dyn_cast<MemIntrinsicSDNode>(N);
   SDValue AddrOp;
   if (LDN)
     AddrOp = LDN->getOperand(1);
   else if (STN)
     AddrOp = STN->getOperand(2);
+  else if (MIN && MIN->getOpcode() == PPCISD::LD_SPLAT)
+    AddrOp = MIN->getOperand(1);
 
   // If the address points a frame object or a frame object with an offset,
   // we need to check the object alignment.
@@ -5881,6 +5884,15 @@
     if (Type != MVT::v16i8 && Type != MVT::v8i16)
       break;
 
+    // If the alignment of the load is 16 or bigger, we don't need the
+    // permuted mask to get the required value. The splatted value is
+    // element 0 of the lvx result on big endian targets and element 7/15
+    // on little endian targets.
+    // Select the instruction in the .td file.
+    if (cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) &&
+        isOffsetMultipleOf(N, 16))
+      break;
+
     SDValue ZeroReg =
         CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                             Subtarget->isPPC64() ? MVT::i64 : MVT::i32);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -158,6 +158,11 @@
 def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">;
 def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">;
 
+def PPCldsplatAlign16 : PatFrag<(ops node:$ptr), (PPCldsplat node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) &&
+         isOffsetMultipleOf(N, 16);
+}]>;
+
 //--------------------- VSX-specific instruction formats ---------------------//
 // By default, all VSX instructions are to be selected over their Altivec
 // counter parts and they do not have unmodeled sideeffects.
@@ -3175,6 +3180,12 @@
              v2f64, (f64 (load ForceXForm:$src)),
              (XXPERMDIs (XFLOADf64 ForceXForm:$src), 2),
              (SUBREG_TO_REG (i64 1), (XFLOADf64 ForceXForm:$src), sub_64)>;
+
+  // Splat loads.
+  def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)),
+            (v8i16 (VSPLTH 7, (LVX ForceXForm:$A)))>;
+  def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)),
+            (v16i8 (VSPLTB 15, (LVX ForceXForm:$A)))>;
 } // HasVSX, NoP9Vector, IsLittleEndian
 
 let Predicates = [HasVSX, NoP9Vector, IsBigEndian] in {
@@ -3182,6 +3193,12 @@
             (LXVD2X ForceXForm:$src)>;
   def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, ForceXForm:$dst),
             (STXVD2X $rS, ForceXForm:$dst)>;
+
+  // Splat loads.
+  def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)),
+            (v8i16 (VSPLTH 0, (LVX ForceXForm:$A)))>;
+  def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)),
+            (v16i8 (VSPLTB 0, (LVX ForceXForm:$A)))>;
 } // HasVSX, NoP9Vector, IsBigEndian
 
 // Any VSX subtarget that only has loads and stores that load in big endian
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -1065,18 +1065,13 @@
 ;
 ; P8-LABEL: test_aligned_v8i16_1:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    lhzx r3, 0, r3
-; P8-NEXT:    mtvsrwz v2, r3
-; P8-NEXT:    vsplth v2, v2, 3
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    vsplth v2, v2, 7
 ; P8-NEXT:    blr
 ;
 ; P7-LABEL: test_aligned_v8i16_1:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    li r4, 1
 ; P7-NEXT:    lvx v2, 0, r3
-; P7-NEXT:    lvsl v4, 0, r3
-; P7-NEXT:    lvx v3, r4, r3
-; P7-NEXT:    vperm v2, v2, v3, v4
 ; P7-NEXT:    vsplth v2, v2, 0
 ; P7-NEXT:    blr
 ;
@@ -1088,18 +1083,13 @@
 ;
 ; P8-AIX32-LABEL: test_aligned_v8i16_1:
 ; P8-AIX32:       # %bb.0: # %entry
-; P8-AIX32-NEXT:    lhzx r3, 0, r3
-; P8-AIX32-NEXT:    mtvsrwz v2, r3
-; P8-AIX32-NEXT:    vsplth v2, v2, 3
+; P8-AIX32-NEXT:    lvx v2, 0, r3
+; P8-AIX32-NEXT:    vsplth v2, v2, 0
 ; P8-AIX32-NEXT:    blr
 ;
 ; P7-AIX32-LABEL: test_aligned_v8i16_1:
 ; P7-AIX32:       # %bb.0: # %entry
-; P7-AIX32-NEXT:    li r4, 1
 ; P7-AIX32-NEXT:    lvx v2, 0, r3
-; P7-AIX32-NEXT:    lvsl v4, 0, r3
-; P7-AIX32-NEXT:    lvx v3, r4, r3
-; P7-AIX32-NEXT:    vperm v2, v2, v3, v4
 ; P7-AIX32-NEXT:    vsplth v2, v2, 0
 ; P7-AIX32-NEXT:    blr
 entry:
@@ -1119,19 +1109,15 @@
 ;
 ; P8-LABEL: test_aligned_v8i16_2:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    lhz r3, 32(r3)
-; P8-NEXT:    mtvsrwz v2, r3
-; P8-NEXT:    vsplth v2, v2, 3
+; P8-NEXT:    addi r3, r3, 32
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    vsplth v2, v2, 7
 ; P8-NEXT:    blr
 ;
 ; P7-LABEL: test_aligned_v8i16_2:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    li r4, 1
 ; P7-NEXT:    addi r3, r3, 32
 ; P7-NEXT:    lvx v2, 0, r3
-; P7-NEXT:    lvx v3, r4, r3
-; P7-NEXT:    lvsl v4, 0, r3
-; P7-NEXT:    vperm v2, v2, v3, v4
 ; P7-NEXT:    vsplth v2, v2, 0
 ; P7-NEXT:    blr
 ;
@@ -1144,19 +1130,15 @@
 ;
 ; P8-AIX32-LABEL: test_aligned_v8i16_2:
 ; P8-AIX32:       # %bb.0: # %entry
-; P8-AIX32-NEXT:    lhz r3, 32(r3)
-; P8-AIX32-NEXT:    mtvsrwz v2, r3
-; P8-AIX32-NEXT:    vsplth v2, v2, 3
+; P8-AIX32-NEXT:    addi r3, r3, 32
+; P8-AIX32-NEXT:    lvx v2, 0, r3
+; P8-AIX32-NEXT:    vsplth v2, v2, 0
 ; P8-AIX32-NEXT:    blr
 ;
 ; P7-AIX32-LABEL: test_aligned_v8i16_2:
 ; P7-AIX32:       # %bb.0: # %entry
-; P7-AIX32-NEXT:    li r4, 1
 ; P7-AIX32-NEXT:    addi r3, r3, 32
 ; P7-AIX32-NEXT:    lvx v2, 0, r3
-; P7-AIX32-NEXT:    lvx v3, r4, r3
-; P7-AIX32-NEXT:    lvsl v4, 0, r3
-; P7-AIX32-NEXT:    vperm v2, v2, v3, v4
 ; P7-AIX32-NEXT:    vsplth v2, v2, 0
 ; P7-AIX32-NEXT:    blr
 entry:
@@ -1176,16 +1158,13 @@
 ;
 ; P8-LABEL: test_aligned_v16i8_1:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    lbzx r3, 0, r3
-; P8-NEXT:    mtvsrwz v2, r3
-; P8-NEXT:    vspltb v2, v2, 7
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    vspltb v2, v2, 15
 ; P8-NEXT:    blr
 ;
 ; P7-LABEL: test_aligned_v16i8_1:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    lvsl v2, 0, r3
-; P7-NEXT:    lvx v3, 0, r3
-; P7-NEXT:    vperm v2, v3, v3, v2
+; P7-NEXT:    lvx v2, 0, r3
 ; P7-NEXT:    vspltb v2, v2, 0
 ; P7-NEXT:    blr
 ;
@@ -1197,16 +1176,13 @@
 ;
 ; P8-AIX32-LABEL: test_aligned_v16i8_1:
 ; P8-AIX32:       # %bb.0: # %entry
-; P8-AIX32-NEXT:    lbzx r3, 0, r3
-; P8-AIX32-NEXT:    mtvsrwz v2, r3
-; P8-AIX32-NEXT:    vspltb v2, v2, 7
+; P8-AIX32-NEXT:    lvx v2, 0, r3
+; P8-AIX32-NEXT:    vspltb v2, v2, 0
 ; P8-AIX32-NEXT:    blr
 ;
 ; P7-AIX32-LABEL: test_aligned_v16i8_1:
 ; P7-AIX32:       # %bb.0: # %entry
-; P7-AIX32-NEXT:    lvsl v2, 0, r3
-; P7-AIX32-NEXT:    lvx v3, 0, r3
-; P7-AIX32-NEXT:    vperm v2, v3, v3, v2
+; P7-AIX32-NEXT:    lvx v2, 0, r3
 ; P7-AIX32-NEXT:    vspltb v2, v2, 0
 ; P7-AIX32-NEXT:    blr
 entry:
@@ -1226,17 +1202,15 @@
 ;
 ; P8-LABEL: test_aligned_v16i8_2:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    lbz r3, 16(r3)
-; P8-NEXT:    mtvsrwz v2, r3
-; P8-NEXT:    vspltb v2, v2, 7
+; P8-NEXT:    addi r3, r3, 16
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    vspltb v2, v2, 15
 ; P8-NEXT:    blr
 ;
 ; P7-LABEL: test_aligned_v16i8_2:
 ; P7:       # %bb.0: # %entry
 ; P7-NEXT:    addi r3, r3, 16
-; P7-NEXT:    lvsl v2, 0, r3
-; P7-NEXT:    lvx v3, 0, r3
-; P7-NEXT:    vperm v2, v3, v3, v2
+; P7-NEXT:    lvx v2, 0, r3
 ; P7-NEXT:    vspltb v2, v2, 0
 ; P7-NEXT:    blr
 ;
@@ -1249,17 +1223,15 @@
 ;
 ; P8-AIX32-LABEL: test_aligned_v16i8_2:
 ; P8-AIX32:       # %bb.0: # %entry
-; P8-AIX32-NEXT:    lbz r3, 16(r3)
-; P8-AIX32-NEXT:    mtvsrwz v2, r3
-; P8-AIX32-NEXT:    vspltb v2, v2, 7
+; P8-AIX32-NEXT:    addi r3, r3, 16
+; P8-AIX32-NEXT:    lvx v2, 0, r3
+; P8-AIX32-NEXT:    vspltb v2, v2, 0
 ; P8-AIX32-NEXT:    blr
 ;
 ; P7-AIX32-LABEL: test_aligned_v16i8_2:
 ; P7-AIX32:       # %bb.0: # %entry
 ; P7-AIX32-NEXT:    addi r3, r3, 16
-; P7-AIX32-NEXT:    lvsl v2, 0, r3
-; P7-AIX32-NEXT:    lvx v3, 0, r3
-; P7-AIX32-NEXT:    vperm v2, v3, v3, v2
+; P7-AIX32-NEXT:    lvx v2, 0, r3
 ; P7-AIX32-NEXT:    vspltb v2, v2, 0
 ; P7-AIX32-NEXT:    blr
 entry:
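
Note (not part of the patch): the test diff above only shows the updated FileCheck lines, not the IR bodies. A minimal sketch of the kind of IR these tests exercise, assuming the usual load-and-splat.ll shape (the function name and argument below are illustrative, not copied from the test), is a scalar load with 16-byte alignment splatted across a vector; the shuffle is lowered to a PPCISD::LD_SPLAT node, which the new PPCldsplatAlign16 patterns select to lvx plus vsplth/vspltb on pre-Power9 VSX subtargets.

; Illustrative sketch only -- not taken from load-and-splat.ll.
define <8 x i16> @splat_aligned_halfword(ptr %p) {
entry:
  ; The align 16 on the scalar load is what lets PPCldsplatAlign16 match,
  ; replacing the old lhzx + mtvsrwz (or lvsl/lvx/vperm) sequences with
  ; a single lvx followed by vsplth.
  %val = load i16, ptr %p, align 16
  %ins = insertelement <8 x i16> undef, i16 %val, i32 0
  %splat = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}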