diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -4466,11 +4466,14 @@
 bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
   LoadSDNode *LDN = dyn_cast<LoadSDNode>(N);
   StoreSDNode *STN = dyn_cast<StoreSDNode>(N);
+  MemIntrinsicSDNode *MIN = dyn_cast<MemIntrinsicSDNode>(N);
   SDValue AddrOp;
   if (LDN)
     AddrOp = LDN->getOperand(1);
   else if (STN)
     AddrOp = STN->getOperand(2);
+  else if (MIN && MIN->getOpcode() == PPCISD::LD_SPLAT)
+    AddrOp = MIN->getOperand(1);
 
   // If the address points a frame object or a frame object with an offset,
   // we need to check the object alignment.
@@ -5881,6 +5884,15 @@
     if (Type != MVT::v16i8 && Type != MVT::v8i16)
       break;
 
+    // If the load's alignment is 16 or greater, we do not need the permuted
+    // mask to extract the splat value: it is element 0 on big-endian targets,
+    // or element 7/15 (for v8i16/v16i8) on little-endian targets, of the VSX
+    // register produced by the lvx instruction.
+    // Let the patterns in the .td file select the instructions instead.
+    if (cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) &&
+        isOffsetMultipleOf(N, 16))
+      break;
+
     SDValue ZeroReg =
         CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                             Subtarget->isPPC64() ? MVT::i64 : MVT::i32);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -158,6 +158,11 @@
 def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">;
 def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">;
 
+def PPCldsplatAlign16 : PatFrag<(ops node:$ptr), (PPCldsplat node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) &&
+         isOffsetMultipleOf(N, 16);
+}]>;
+
 //--------------------- VSX-specific instruction formats ---------------------//
 // By default, all VSX instructions are to be selected over their Altivec
 // counter parts and they do not have unmodeled sideeffects.
@@ -3004,6 +3009,12 @@
           (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $B, sub_64), $A, 1))>;
 def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 1)),
           (v2f64 (XXPERMDI $A, (SUBREG_TO_REG (i64 1), $B, sub_64), 0))>;
+
+// Splat loads.
+def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)),
+          (v8i16 (VSPLTH 0, (LVX ForceXForm:$A)))>;
+def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)),
+          (v16i8 (VSPLTB 0, (LVX ForceXForm:$A)))>;
 } // HasVSX, IsBigEndian
 
 // Any little endian VSX subtarget.
@@ -3132,6 +3143,12 @@
           (v2f64 (XXPERMDI $A, (SUBREG_TO_REG (i64 1), $B, sub_64), 0))>;
 def : Pat<(v2f64 (insertelt v2f64:$A, f64:$B, 1)),
           (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $B, sub_64), $A, 1))>;
+
+// Splat loads.
+def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)),
+          (v8i16 (VSPLTH 7, (LVX ForceXForm:$A)))>;
+def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)),
+          (v16i8 (VSPLTB 15, (LVX ForceXForm:$A)))>;
 } // HasVSX, IsLittleEndian
 
 // Any pre-Power9 VSX subtarget.
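For reference: the PPCldsplatAlign16 PatFrag above only matches a PPCldsplat node whose load is known to be 16-byte aligned at an offset that is a multiple of 16, so the full quadword can be fetched with a single lvx and the splat done with one vsplth/vspltb. A minimal IR sketch of the shape that reaches this path (hypothetical function name; the tests below exercise the same pattern):

define <8 x i16> @splat_sketch(i16* %p) {
entry:
  ; scalar load with known 16-byte alignment
  %v = load i16, i16* %p, align 16
  ; splat the loaded value into all eight lanes; this load-and-splat is
  ; lowered to a PPCISD::LD_SPLAT node, which the new patterns select
  %ins = insertelement <8 x i16> undef, i16 %v, i32 0
  %splat = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}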
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -1059,47 +1059,37 @@
 define <8 x i16> @test_aligned_v8i16_1(i16* %Ptr) {
 ; P9-LABEL: test_aligned_v8i16_1:
 ; P9:       # %bb.0: # %entry
-; P9-NEXT:    lxsihzx v2, 0, r3
-; P9-NEXT:    vsplth v2, v2, 3
+; P9-NEXT:    lvx v2, 0, r3
+; P9-NEXT:    vsplth v2, v2, 0
 ; P9-NEXT:    blr
 ;
 ; P8-LABEL: test_aligned_v8i16_1:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    lhzx r3, 0, r3
-; P8-NEXT:    mtvsrwz v2, r3
-; P8-NEXT:    vsplth v2, v2, 3
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    vsplth v2, v2, 7
 ; P8-NEXT:    blr
 ;
 ; P7-LABEL: test_aligned_v8i16_1:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    li r4, 1
 ; P7-NEXT:    lvx v2, 0, r3
-; P7-NEXT:    lvsl v4, 0, r3
-; P7-NEXT:    lvx v3, r4, r3
-; P7-NEXT:    vperm v2, v2, v3, v4
 ; P7-NEXT:    vsplth v2, v2, 0
 ; P7-NEXT:    blr
 ;
 ; P9-AIX32-LABEL: test_aligned_v8i16_1:
 ; P9-AIX32:       # %bb.0: # %entry
-; P9-AIX32-NEXT:    lxsihzx v2, 0, r3
-; P9-AIX32-NEXT:    vsplth v2, v2, 3
+; P9-AIX32-NEXT:    lvx v2, 0, r3
+; P9-AIX32-NEXT:    vsplth v2, v2, 0
 ; P9-AIX32-NEXT:    blr
 ;
 ; P8-AIX32-LABEL: test_aligned_v8i16_1:
 ; P8-AIX32:       # %bb.0: # %entry
-; P8-AIX32-NEXT:    lhzx r3, 0, r3
-; P8-AIX32-NEXT:    mtvsrwz v2, r3
-; P8-AIX32-NEXT:    vsplth v2, v2, 3
+; P8-AIX32-NEXT:    lvx v2, 0, r3
+; P8-AIX32-NEXT:    vsplth v2, v2, 0
 ; P8-AIX32-NEXT:    blr
 ;
 ; P7-AIX32-LABEL: test_aligned_v8i16_1:
 ; P7-AIX32:       # %bb.0: # %entry
-; P7-AIX32-NEXT:    li r4, 1
 ; P7-AIX32-NEXT:    lvx v2, 0, r3
-; P7-AIX32-NEXT:    lvsl v4, 0, r3
-; P7-AIX32-NEXT:    lvx v3, r4, r3
-; P7-AIX32-NEXT:    vperm v2, v2, v3, v4
 ; P7-AIX32-NEXT:    vsplth v2, v2, 0
 ; P7-AIX32-NEXT:    blr
 entry:
@@ -1113,50 +1103,42 @@
 ; P9-LABEL: test_aligned_v8i16_2:
 ; P9:       # %bb.0: # %entry
 ; P9-NEXT:    addi r3, r3, 32
-; P9-NEXT:    lxsihzx v2, 0, r3
-; P9-NEXT:    vsplth v2, v2, 3
+; P9-NEXT:    lvx v2, 0, r3
+; P9-NEXT:    vsplth v2, v2, 0
 ; P9-NEXT:    blr
 ;
 ; P8-LABEL: test_aligned_v8i16_2:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    lhz r3, 32(r3)
-; P8-NEXT:    mtvsrwz v2, r3
-; P8-NEXT:    vsplth v2, v2, 3
+; P8-NEXT:    addi r3, r3, 32
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    vsplth v2, v2, 7
 ; P8-NEXT:    blr
 ;
 ; P7-LABEL: test_aligned_v8i16_2:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    li r4, 1
 ; P7-NEXT:    addi r3, r3, 32
 ; P7-NEXT:    lvx v2, 0, r3
-; P7-NEXT:    lvx v3, r4, r3
-; P7-NEXT:    lvsl v4, 0, r3
-; P7-NEXT:    vperm v2, v2, v3, v4
 ; P7-NEXT:    vsplth v2, v2, 0
 ; P7-NEXT:    blr
 ;
 ; P9-AIX32-LABEL: test_aligned_v8i16_2:
 ; P9-AIX32:       # %bb.0: # %entry
 ; P9-AIX32-NEXT:    addi r3, r3, 32
-; P9-AIX32-NEXT:    lxsihzx v2, 0, r3
-; P9-AIX32-NEXT:    vsplth v2, v2, 3
+; P9-AIX32-NEXT:    lvx v2, 0, r3
+; P9-AIX32-NEXT:    vsplth v2, v2, 0
 ; P9-AIX32-NEXT:    blr
 ;
 ; P8-AIX32-LABEL: test_aligned_v8i16_2:
 ; P8-AIX32:       # %bb.0: # %entry
-; P8-AIX32-NEXT:    lhz r3, 32(r3)
-; P8-AIX32-NEXT:    mtvsrwz v2, r3
-; P8-AIX32-NEXT:    vsplth v2, v2, 3
+; P8-AIX32-NEXT:    addi r3, r3, 32
+; P8-AIX32-NEXT:    lvx v2, 0, r3
+; P8-AIX32-NEXT:    vsplth v2, v2, 0
 ; P8-AIX32-NEXT:    blr
 ;
 ; P7-AIX32-LABEL: test_aligned_v8i16_2:
 ; P7-AIX32:       # %bb.0: # %entry
-; P7-AIX32-NEXT:    li r4, 1
 ; P7-AIX32-NEXT:    addi r3, r3, 32
 ; P7-AIX32-NEXT:    lvx v2, 0, r3
-; P7-AIX32-NEXT:    lvx v3, r4, r3
-; P7-AIX32-NEXT:    lvsl v4, 0, r3
-; P7-AIX32-NEXT:    vperm v2, v2, v3, v4
 ; P7-AIX32-NEXT:    vsplth v2, v2, 0
 ; P7-AIX32-NEXT:    blr
 entry:
@@ -1170,43 +1152,37 @@
 define <16 x i8> @test_aligned_v16i8_1(i8* %Ptr) {
 ; P9-LABEL: test_aligned_v16i8_1:
 ; P9:       # %bb.0: # %entry
-; P9-NEXT:    lxsibzx v2, 0, r3
-; P9-NEXT:    vspltb v2, v2, 7
+; P9-NEXT:    lvx v2, 0, r3
+; P9-NEXT:    vspltb v2, v2, 0
 ; P9-NEXT:    blr
 ;
 ; P8-LABEL: test_aligned_v16i8_1:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    lbzx r3, 0, r3
-; P8-NEXT:    mtvsrwz v2, r3
-; P8-NEXT:    vspltb v2, v2, 7
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    vspltb v2, v2, 15
 ; P8-NEXT:    blr
 ;
 ; P7-LABEL: test_aligned_v16i8_1:
 ; P7:       # %bb.0: # %entry
-; P7-NEXT:    lvsl v2, 0, r3
-; P7-NEXT:    lvx v3, 0, r3
-; P7-NEXT:    vperm v2, v3, v3, v2
+; P7-NEXT:    lvx v2, 0, r3
 ; P7-NEXT:    vspltb v2, v2, 0
 ; P7-NEXT:    blr
 ;
 ; P9-AIX32-LABEL: test_aligned_v16i8_1:
 ; P9-AIX32:       # %bb.0: # %entry
-; P9-AIX32-NEXT:    lxsibzx v2, 0, r3
-; P9-AIX32-NEXT:    vspltb v2, v2, 7
+; P9-AIX32-NEXT:    lvx v2, 0, r3
+; P9-AIX32-NEXT:    vspltb v2, v2, 0
 ; P9-AIX32-NEXT:    blr
 ;
 ; P8-AIX32-LABEL: test_aligned_v16i8_1:
 ; P8-AIX32:       # %bb.0: # %entry
-; P8-AIX32-NEXT:    lbzx r3, 0, r3
-; P8-AIX32-NEXT:    mtvsrwz v2, r3
-; P8-AIX32-NEXT:    vspltb v2, v2, 7
+; P8-AIX32-NEXT:    lvx v2, 0, r3
+; P8-AIX32-NEXT:    vspltb v2, v2, 0
 ; P8-AIX32-NEXT:    blr
 ;
 ; P7-AIX32-LABEL: test_aligned_v16i8_1:
 ; P7-AIX32:       # %bb.0: # %entry
-; P7-AIX32-NEXT:    lvsl v2, 0, r3
-; P7-AIX32-NEXT:    lvx v3, 0, r3
-; P7-AIX32-NEXT:    vperm v2, v3, v3, v2
+; P7-AIX32-NEXT:    lvx v2, 0, r3
 ; P7-AIX32-NEXT:    vspltb v2, v2, 0
 ; P7-AIX32-NEXT:    blr
 entry:
@@ -1220,46 +1196,42 @@
 ; P9-LABEL: test_aligned_v16i8_2:
 ; P9:       # %bb.0: # %entry
 ; P9-NEXT:    addi r3, r3, 16
-; P9-NEXT:    lxsibzx v2, 0, r3
-; P9-NEXT:    vspltb v2, v2, 7
+; P9-NEXT:    lvx v2, 0, r3
+; P9-NEXT:    vspltb v2, v2, 0
 ; P9-NEXT:    blr
 ;
 ; P8-LABEL: test_aligned_v16i8_2:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    lbz r3, 16(r3)
-; P8-NEXT:    mtvsrwz v2, r3
-; P8-NEXT:    vspltb v2, v2, 7
+; P8-NEXT:    addi r3, r3, 16
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    vspltb v2, v2, 15
 ; P8-NEXT:    blr
 ;
 ; P7-LABEL: test_aligned_v16i8_2:
 ; P7:       # %bb.0: # %entry
 ; P7-NEXT:    addi r3, r3, 16
-; P7-NEXT:    lvsl v2, 0, r3
-; P7-NEXT:    lvx v3, 0, r3
-; P7-NEXT:    vperm v2, v3, v3, v2
+; P7-NEXT:    lvx v2, 0, r3
 ; P7-NEXT:    vspltb v2, v2, 0
 ; P7-NEXT:    blr
 ;
 ; P9-AIX32-LABEL: test_aligned_v16i8_2:
 ; P9-AIX32:       # %bb.0: # %entry
 ; P9-AIX32-NEXT:    addi r3, r3, 16
-; P9-AIX32-NEXT:    lxsibzx v2, 0, r3
-; P9-AIX32-NEXT:    vspltb v2, v2, 7
+; P9-AIX32-NEXT:    lvx v2, 0, r3
+; P9-AIX32-NEXT:    vspltb v2, v2, 0
 ; P9-AIX32-NEXT:    blr
 ;
 ; P8-AIX32-LABEL: test_aligned_v16i8_2:
 ; P8-AIX32:       # %bb.0: # %entry
-; P8-AIX32-NEXT:    lbz r3, 16(r3)
-; P8-AIX32-NEXT:    mtvsrwz v2, r3
-; P8-AIX32-NEXT:    vspltb v2, v2, 7
+; P8-AIX32-NEXT:    addi r3, r3, 16
+; P8-AIX32-NEXT:    lvx v2, 0, r3
+; P8-AIX32-NEXT:    vspltb v2, v2, 0
 ; P8-AIX32-NEXT:    blr
 ;
 ; P7-AIX32-LABEL: test_aligned_v16i8_2:
 ; P7-AIX32:       # %bb.0: # %entry
 ; P7-AIX32-NEXT:    addi r3, r3, 16
-; P7-AIX32-NEXT:    lvsl v2, 0, r3
-; P7-AIX32-NEXT:    lvx v3, 0, r3
-; P7-AIX32-NEXT:    vperm v2, v3, v3, v2
+; P7-AIX32-NEXT:    lvx v2, 0, r3
 ; P7-AIX32-NEXT:    vspltb v2, v2, 0
 ; P7-AIX32-NEXT:    blr
 entry:
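For reference: the updated splat indices in these checks follow from lvx semantics. lvx fetches the aligned 16-byte quadword containing the scalar, and the UIM operand of vspltb/vsplth always uses big-endian element numbering, so the element holding the byte at the (16-byte aligned) load address A differs by endianness. A sketch, assuming the usual lvx register layouts:

  memory bytes:       A+0  A+1  ...  A+15
  BE element number:   0    1   ...   15    -> splat A+0 with vspltb 0 / vsplth 0
  LE element number:  15   14   ...    0    -> splat A+0 with vspltb 15 / vsplth 7

This matches the comment added in PPCISelDAGToDAG.cpp (element 0 on big-endian targets, 7/15 on little-endian targets) and the new patterns in PPCInstrVSX.td.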