Index: llvm/lib/Target/PowerPC/PPCISelLowering.h =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.h +++ llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -456,6 +456,11 @@ /// an xxswapd. LXVD2X, + /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian. + /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on + /// the vector type to load vector in big-endian element order. + LOAD_VEC_BE, + /// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a /// v2f32 value into the lower half of a VSR register. LD_VSX_LH, @@ -465,6 +470,11 @@ /// an xxswapd. STXVD2X, + /// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian. + /// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on + /// the vector type to store vector in big-endian element order. + STORE_VEC_BE, + /// Store scalar integers from VSR. ST_VSR_SCAL_INT, @@ -1167,6 +1177,8 @@ SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase, + DAGCombinerInfo &DCI) const; /// ConvertSETCCToSubtract - looks at SETCC that compares ints. 
It replaces
     /// SETCC with integer subtraction when (1) there is a legal way of doing it
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1118,6 +1118,8 @@
   setTargetDAGCombine(ISD::ANY_EXTEND);
 
   setTargetDAGCombine(ISD::TRUNCATE);
+  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+
 
   if (Subtarget.useCRBits()) {
     setTargetDAGCombine(ISD::TRUNCATE);
@@ -1352,6 +1354,8 @@
   case PPCISD::SExtVElems:      return "PPCISD::SExtVElems";
   case PPCISD::LXVD2X:          return "PPCISD::LXVD2X";
   case PPCISD::STXVD2X:         return "PPCISD::STXVD2X";
+  case PPCISD::LOAD_VEC_BE:     return "PPCISD::LOAD_VEC_BE";
+  case PPCISD::STORE_VEC_BE:    return "PPCISD::STORE_VEC_BE";
   case PPCISD::ST_VSR_SCAL_INT:
                                 return "PPCISD::ST_VSR_SCAL_INT";
   case PPCISD::COND_BRANCH:     return "PPCISD::COND_BRANCH";
@@ -13113,6 +13117,77 @@
   return Val;
 }
 
+/// Fold an element-reversing VECTOR_SHUFFLE into the normal load that feeds
+/// it, or the normal store that consumes it, by emitting a
+/// PPCISD::LOAD_VEC_BE / PPCISD::STORE_VEC_BE node instead.
+///
+/// \p SVN is the shuffle and \p LSBase the load or store. Returns the new
+/// node, or an empty SDValue when the combine does not apply.
+SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
+                                                LSBaseSDNode *LSBase,
+                                                DAGCombinerInfo &DCI) const {
+  assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
+         "Not a reverse memop pattern!");
+
+  // The mask must be exactly <N-1, ..., 1, 0>, i.e. every lane is taken
+  // from the first shuffle operand in reversed order.
+  auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
+    ArrayRef<int> Mask = SVN->getMask();
+    int Expected = 0;
+    for (auto I = Mask.rbegin(), E = Mask.rend(); I != E; ++I, ++Expected)
+      if (*I != Expected)
+        return false;
+    return true;
+  };
+
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = SVN->getValueType(0);
+
+  if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
+    return SDValue();
+
+  // Before P9, we don't have vector load/store instrs in big-endian
+  // element order for v8i16 or v16i8
+  if (!Subtarget.hasP9Vector() && (VT == MVT::v8i16 || VT == MVT::v16i8))
+    return SDValue();
+
+  if (!IsElementReverse(SVN))
+    return SDValue();
+
+  if (LSBase->getOpcode() == ISD::LOAD) {
+    // If the loaded value has users other than this shuffle, the original
+    // load stays alive and folding would only add a second memory access.
+    if (!LSBase->hasNUsesOfValue(1, 0))
+      return SDValue();
+
+    SDLoc dl(SVN);
+    SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
+    SDValue BELoad = DAG.getMemIntrinsicNode(
+        PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,
+        LSBase->getMemoryVT(), LSBase->getMemOperand());
+    // Anything chained after the original load (e.g. a later store to the
+    // same address) must also be ordered after the replacement load.
+    DAG.makeEquivalentMemoryOrdering(cast<LoadSDNode>(LSBase), BELoad);
+    return BELoad;
+  }
+
+  if (LSBase->getOpcode() == ISD::STORE) {
+    // If the shuffle has other users it still has to be materialized, so
+    // folding it into the store does not remove any work.
+    if (!SVN->hasOneUse())
+      return SDValue();
+
+    SDLoc dl(LSBase);
+    SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
+                          LSBase->getBasePtr()};
+    return DAG.getMemIntrinsicNode(
+        PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps,
+        LSBase->getMemoryVT(), LSBase->getMemOperand());
+  }
+
+  llvm_unreachable("Expected a load or store node here");
+}
+
 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -13159,6 +13234,12 @@
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
     return combineFPToIntToFP(N, DCI);
+  case ISD::VECTOR_SHUFFLE:
+    if (ISD::isNormalLoad(N->getOperand(0).getNode())) {
+      LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0));
+      return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
+    }
+    break;
   case ISD::STORE: {
 
     EVT Op1VT = N->getOperand(1).getValueType();
@@ -13170,6 +13251,13 @@
         return Val;
     }
 
+    if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
+      ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));
+      SDValue Val= combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);
+      if (Val)
+        return Val;
+    }
+
     // Turn STORE (BSWAP) -> sthbrx/stwbrx.
if (cast(N)->isUnindexed() && Opcode == ISD::BSWAP && N->getOperand(1).getNode()->hasOneUse() && Index: llvm/lib/Target/PowerPC/PPCInstrVSX.td =================================================================== --- llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -78,12 +78,21 @@ def SDTVabsd : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32> ]>; - +def SDT_PPCld_vec_be : SDTypeProfile<1, 1, [ + SDTCisVec<0>, SDTCisPtrTy<1> +]>; +def SDT_PPCst_vec_be : SDTypeProfile<0, 2, [ + SDTCisVec<0>, SDTCisPtrTy<1> +]>; def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x, [SDNPHasChain, SDNPMayStore]>; +def PPCld_vec_be : SDNode<"PPCISD::LOAD_VEC_BE", SDT_PPCld_vec_be, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def PPCst_vec_be : SDNode<"PPCISD::STORE_VEC_BE", SDT_PPCst_vec_be, + [SDNPHasChain, SDNPMayStore]>; def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>; def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; @@ -1088,6 +1097,19 @@ (STXVD2X $rS, xoaddr:$dst)>; def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; } + +// Load vector big endian order +let Predicates = [IsLittleEndian, HasVSX] in { + def : Pat<(v2f64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; + def : Pat<(v4f32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v4f32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>; + def : Pat<(v2i64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; + def : Pat<(v4i32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>; + def : Pat<(PPCst_vec_be v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, 
xoaddr:$dst)>; +} + let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in { def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; @@ -3024,6 +3046,16 @@ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>; def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)), (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>; + + def : Pat<(v8i16 (PPCld_vec_be xoaddr:$src)), + (COPY_TO_REGCLASS (LXVH8X xoaddr:$src), VRRC)>; + def : Pat<(PPCst_vec_be v8i16:$rS, xoaddr:$dst), + (STXVH8X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>; + + def : Pat<(v16i8 (PPCld_vec_be xoaddr:$src)), + (COPY_TO_REGCLASS (LXVB16X xoaddr:$src), VRRC)>; + def : Pat<(PPCst_vec_be v16i8:$rS, xoaddr:$dst), + (STXVB16X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>; } // IsLittleEndian, HasP9Vector let Predicates = [IsBigEndian, HasP9Vector] in { Index: llvm/test/CodeGen/PowerPC/build-vector-tests.ll =================================================================== --- llvm/test/CodeGen/PowerPC/build-vector-tests.ll +++ llvm/test/CodeGen/PowerPC/build-vector-tests.ll @@ -986,11 +986,7 @@ ; ; P9LE-LABEL: fromDiffMemConsDi: ; P9LE: # %bb.0: # %entry -; P9LE-NEXT: lxv v2, 0(r3) -; P9LE-NEXT: addis r3, r2, .LCPI8_0@toc@ha -; P9LE-NEXT: addi r3, r3, .LCPI8_0@toc@l -; P9LE-NEXT: lxvx v3, 0, r3 -; P9LE-NEXT: vperm v2, v2, v2, v3 +; P9LE-NEXT: lxvw4x v2, 0, r3 ; P9LE-NEXT: blr ; ; P8BE-LABEL: fromDiffMemConsDi: @@ -1004,12 +1000,7 @@ ; ; P8LE-LABEL: fromDiffMemConsDi: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r4, r2, .LCPI8_0@toc@ha -; P8LE-NEXT: addi r3, r4, .LCPI8_0@toc@l -; P8LE-NEXT: lvx v2, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v3, v3, v2 +; P8LE-NEXT: lxvw4x v2, 0, r3 ; P8LE-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, i32* %arr, i64 3 @@ -2570,11 +2561,7 @@ ; ; P9LE-LABEL: fromDiffMemConsDui: ; P9LE: # %bb.0: # %entry -; P9LE-NEXT: lxv v2, 0(r3) -; 
P9LE-NEXT: addis r3, r2, .LCPI41_0@toc@ha -; P9LE-NEXT: addi r3, r3, .LCPI41_0@toc@l -; P9LE-NEXT: lxvx v3, 0, r3 -; P9LE-NEXT: vperm v2, v2, v2, v3 +; P9LE-NEXT: lxvw4x v2, 0, r3 ; P9LE-NEXT: blr ; ; P8BE-LABEL: fromDiffMemConsDui: @@ -2588,12 +2575,7 @@ ; ; P8LE-LABEL: fromDiffMemConsDui: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r4, r2, .LCPI41_0@toc@ha -; P8LE-NEXT: addi r3, r4, .LCPI41_0@toc@l -; P8LE-NEXT: lvx v2, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v3, v3, v2 +; P8LE-NEXT: lxvw4x v2, 0, r3 ; P8LE-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, i32* %arr, i64 3 @@ -4155,8 +4137,8 @@ ; ; P9LE-LABEL: fromDiffMemConsDll: ; P9LE: # %bb.0: # %entry -; P9LE-NEXT: lxv v2, 16(r3) -; P9LE-NEXT: xxswapd v2, v2 +; P9LE-NEXT: addi r3, r3, 16 +; P9LE-NEXT: lxvd2x v2, 0, r3 ; P9LE-NEXT: blr ; ; P8BE-LABEL: fromDiffMemConsDll: @@ -4235,9 +4217,8 @@ ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: sldi r4, r4, 3 ; P9LE-NEXT: add r3, r3, r4 -; P9LE-NEXT: li r4, -8 -; P9LE-NEXT: lxvx v2, r3, r4 -; P9LE-NEXT: xxswapd v2, v2 +; P9LE-NEXT: addi r3, r3, -8 +; P9LE-NEXT: lxvd2x v2, 0, r3 ; P9LE-NEXT: blr ; ; P8BE-LABEL: fromDiffMemVarDll: @@ -4948,8 +4929,8 @@ ; ; P9LE-LABEL: fromDiffMemConsDConvdtoll: ; P9LE: # %bb.0: # %entry -; P9LE-NEXT: lxv vs0, 16(r3) -; P9LE-NEXT: xxswapd vs0, vs0 +; P9LE-NEXT: addi r3, r3, 16 +; P9LE-NEXT: lxvd2x vs0, 0, r3 ; P9LE-NEXT: xvcvdpsxds v2, vs0 ; P9LE-NEXT: blr ; @@ -5040,9 +5021,8 @@ ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: sldi r4, r4, 3 ; P9LE-NEXT: add r3, r3, r4 -; P9LE-NEXT: li r4, -8 -; P9LE-NEXT: lxvx vs0, r3, r4 -; P9LE-NEXT: xxswapd vs0, vs0 +; P9LE-NEXT: addi r3, r3, -8 +; P9LE-NEXT: lxvd2x vs0, 0, r3 ; P9LE-NEXT: xvcvdpsxds v2, vs0 ; P9LE-NEXT: blr ; @@ -5402,8 +5382,8 @@ ; ; P9LE-LABEL: fromDiffMemConsDull: ; P9LE: # %bb.0: # %entry -; P9LE-NEXT: lxv v2, 16(r3) -; P9LE-NEXT: xxswapd v2, v2 +; P9LE-NEXT: addi r3, r3, 16 +; P9LE-NEXT: lxvd2x v2, 0, r3 ; P9LE-NEXT: blr ; ; 
P8BE-LABEL: fromDiffMemConsDull: @@ -5482,9 +5462,8 @@ ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: sldi r4, r4, 3 ; P9LE-NEXT: add r3, r3, r4 -; P9LE-NEXT: li r4, -8 -; P9LE-NEXT: lxvx v2, r3, r4 -; P9LE-NEXT: xxswapd v2, v2 +; P9LE-NEXT: addi r3, r3, -8 +; P9LE-NEXT: lxvd2x v2, 0, r3 ; P9LE-NEXT: blr ; ; P8BE-LABEL: fromDiffMemVarDull: @@ -6195,8 +6174,8 @@ ; ; P9LE-LABEL: fromDiffMemConsDConvdtoull: ; P9LE: # %bb.0: # %entry -; P9LE-NEXT: lxv vs0, 16(r3) -; P9LE-NEXT: xxswapd vs0, vs0 +; P9LE-NEXT: addi r3, r3, 16 +; P9LE-NEXT: lxvd2x vs0, 0, r3 ; P9LE-NEXT: xvcvdpuxds v2, vs0 ; P9LE-NEXT: blr ; @@ -6287,9 +6266,8 @@ ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: sldi r4, r4, 3 ; P9LE-NEXT: add r3, r3, r4 -; P9LE-NEXT: li r4, -8 -; P9LE-NEXT: lxvx vs0, r3, r4 -; P9LE-NEXT: xxswapd vs0, vs0 +; P9LE-NEXT: addi r3, r3, -8 +; P9LE-NEXT: lxvd2x vs0, 0, r3 ; P9LE-NEXT: xvcvdpuxds v2, vs0 ; P9LE-NEXT: blr ; Index: llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll =================================================================== --- llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll +++ llvm/test/CodeGen/PowerPC/load-shuffle-and-shuffle-store.ll @@ -19,8 +19,7 @@ ; ; CHECK-P9-LABEL: load_swap00: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv v2, 0(r3) -; CHECK-P9-NEXT: xxswapd v2, v2 +; CHECK-P9-NEXT: lxvd2x v2, 0, r3 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: load_swap00: @@ -48,8 +47,7 @@ ; ; CHECK-P9-LABEL: load_swap01: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv v2, 0(r4) -; CHECK-P9-NEXT: xxswapd v2, v2 +; CHECK-P9-NEXT: lxvd2x v2, 0, r4 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: load_swap01: @@ -72,20 +70,12 @@ define <4 x i32> @load_swap10(<4 x i32>* %vp1, <4 x i32>* %vp2) { ; CHECK-P8-LABEL: load_swap10: ; CHECK-P8: # %bb.0: -; CHECK-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha -; CHECK-P8-NEXT: lvx v3, 0, r3 -; CHECK-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l -; CHECK-P8-NEXT: lvx v2, 0, r4 -; CHECK-P8-NEXT: vperm v2, v3, v3, v2 +; CHECK-P8-NEXT: lxvw4x v2, 0, r3 ; 
CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: load_swap10: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv v2, 0(r3) -; CHECK-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: lxvw4x v2, 0, r3 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: load_swap10: @@ -114,20 +104,12 @@ define <4 x i32> @load_swap11(<4 x i32>* %vp1, <4 x i32>* %vp2) { ; CHECK-P8-LABEL: load_swap11: ; CHECK-P8: # %bb.0: -; CHECK-P8-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; CHECK-P8-NEXT: lvx v3, 0, r4 -; CHECK-P8-NEXT: addi r3, r3, .LCPI3_0@toc@l -; CHECK-P8-NEXT: lvx v2, 0, r3 -; CHECK-P8-NEXT: vperm v2, v3, v3, v2 +; CHECK-P8-NEXT: lxvw4x v2, 0, r4 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: load_swap11: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l -; CHECK-P9-NEXT: lxv v2, 0(r4) -; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: lxvw4x v2, 0, r4 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: load_swap11: @@ -165,11 +147,7 @@ ; ; CHECK-P9-LABEL: load_swap20: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv v2, 0(r3) -; CHECK-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: lxvh8x v2, 0, r3 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: load_swap20: @@ -207,11 +185,7 @@ ; ; CHECK-P9-LABEL: load_swap21: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis r3, r2, .LCPI5_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI5_0@toc@l -; CHECK-P9-NEXT: lxv v2, 0(r4) -; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: lxvh8x v2, 0, r4 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: load_swap21: @@ -249,8 +223,7 @@ ; ; CHECK-P9-LABEL: load_swap30: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv vs0, 0(r3) -; CHECK-P9-NEXT: xxbrq v2, vs0 +; CHECK-P9-NEXT: lxvb16x v2, 
0, r3 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: load_swap30: @@ -285,8 +258,7 @@ ; ; CHECK-P9-LABEL: load_swap31: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv vs0, 0(r4) -; CHECK-P9-NEXT: xxbrq v2, vs0 +; CHECK-P9-NEXT: lxvb16x v2, 0, r4 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: load_swap31: @@ -317,8 +289,7 @@ ; ; CHECK-P9-LABEL: load_swap40: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv vs0, 0(r4) -; CHECK-P9-NEXT: xxswapd v2, vs0 +; CHECK-P9-NEXT: lxvd2x v2, 0, r4 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: load_swap40: @@ -341,20 +312,12 @@ define <4 x float> @load_swap50(<4 x float>* %vp1, <4 x float>* %vp2) { ; CHECK-P8-LABEL: load_swap50: ; CHECK-P8: # %bb.0: -; CHECK-P8-NEXT: addis r4, r2, .LCPI9_0@toc@ha -; CHECK-P8-NEXT: lvx v3, 0, r3 -; CHECK-P8-NEXT: addi r4, r4, .LCPI9_0@toc@l -; CHECK-P8-NEXT: lvx v2, 0, r4 -; CHECK-P8-NEXT: vperm v2, v3, v3, v2 +; CHECK-P8-NEXT: lxvw4x v2, 0, r3 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: load_swap50: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv v2, 0(r3) -; CHECK-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: lxvw4x v2, 0, r3 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: load_swap50: @@ -383,20 +346,12 @@ define <4 x float> @load_swap51(<4 x float>* %vp1, <4 x float>* %vp2) { ; CHECK-P8-LABEL: load_swap51: ; CHECK-P8: # %bb.0: -; CHECK-P8-NEXT: addis r3, r2, .LCPI10_0@toc@ha -; CHECK-P8-NEXT: lvx v3, 0, r4 -; CHECK-P8-NEXT: addi r3, r3, .LCPI10_0@toc@l -; CHECK-P8-NEXT: lvx v2, 0, r3 -; CHECK-P8-NEXT: vperm v2, v3, v3, v2 +; CHECK-P8-NEXT: lxvw4x v2, 0, r4 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: load_swap51: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis r3, r2, .LCPI10_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI10_0@toc@l -; CHECK-P9-NEXT: lxv v2, 0(r4) -; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: vperm v2, v2, v2, v3 +; CHECK-P9-NEXT: lxvw4x v2, 0, r4 ; CHECK-P9-NEXT: blr ; ; 
CHECK-P8-BE-LABEL: load_swap51: @@ -430,8 +385,7 @@ ; ; CHECK-P9-LABEL: swap_store00: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: xxswapd vs0, v2 -; CHECK-P9-NEXT: stxv vs0, 0(r7) +; CHECK-P9-NEXT: stxvd2x v2, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store00: @@ -458,8 +412,7 @@ ; ; CHECK-P9-LABEL: swap_store01: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: xxswapd vs0, v3 -; CHECK-P9-NEXT: stxv vs0, 0(r7) +; CHECK-P9-NEXT: stxvd2x v3, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store01: @@ -481,20 +434,12 @@ define void @swap_store10(<4 x i32> %v1, <4 x i32> %v2, <4 x i32>* %vp) { ; CHECK-P8-LABEL: swap_store10: ; CHECK-P8: # %bb.0: -; CHECK-P8-NEXT: addis r3, r2, .LCPI13_0@toc@ha -; CHECK-P8-NEXT: addi r3, r3, .LCPI13_0@toc@l -; CHECK-P8-NEXT: lvx v3, 0, r3 -; CHECK-P8-NEXT: vperm v2, v2, v2, v3 -; CHECK-P8-NEXT: stvx v2, 0, r7 +; CHECK-P8-NEXT: stxvw4x v2, 0, r7 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: swap_store10: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis r3, r2, .LCPI13_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI13_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: vperm v2, v2, v2, v3 -; CHECK-P9-NEXT: stxv v2, 0(r7) +; CHECK-P9-NEXT: stxvw4x v2, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store10: @@ -522,20 +467,12 @@ define void @swap_store11(<4 x i32> %v1, <4 x i32> %v2, <4 x i32>* %vp) { ; CHECK-P8-LABEL: swap_store11: ; CHECK-P8: # %bb.0: -; CHECK-P8-NEXT: addis r3, r2, .LCPI14_0@toc@ha -; CHECK-P8-NEXT: addi r3, r3, .LCPI14_0@toc@l -; CHECK-P8-NEXT: lvx v2, 0, r3 -; CHECK-P8-NEXT: vperm v2, v3, v3, v2 -; CHECK-P8-NEXT: stvx v2, 0, r7 +; CHECK-P8-NEXT: stxvw4x v3, 0, r7 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: swap_store11: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis r3, r2, .LCPI14_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI14_0@toc@l -; CHECK-P9-NEXT: lxvx v2, 0, r3 -; CHECK-P9-NEXT: vperm v2, v3, v3, v2 -; CHECK-P9-NEXT: stxv v2, 0(r7) +; CHECK-P9-NEXT: stxvw4x v3, 0, r7 ; CHECK-P9-NEXT: blr ; ; 
CHECK-P8-BE-LABEL: swap_store11: @@ -572,11 +509,7 @@ ; ; CHECK-P9-LABEL: swap_store20: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis r3, r2, .LCPI15_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI15_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: vperm v2, v2, v2, v3 -; CHECK-P9-NEXT: stxv v2, 0(r7) +; CHECK-P9-NEXT: stxvh8x v2, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store20: @@ -613,11 +546,7 @@ ; ; CHECK-P9-LABEL: swap_store21: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l -; CHECK-P9-NEXT: lxvx v2, 0, r3 -; CHECK-P9-NEXT: vperm v2, v3, v3, v2 -; CHECK-P9-NEXT: stxv v2, 0(r7) +; CHECK-P9-NEXT: stxvh8x v3, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store21: @@ -654,8 +583,7 @@ ; ; CHECK-P9-LABEL: swap_store30: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: xxbrq vs0, v2 -; CHECK-P9-NEXT: stxv vs0, 0(r7) +; CHECK-P9-NEXT: stxvb16x v2, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store30: @@ -689,8 +617,7 @@ ; ; CHECK-P9-LABEL: swap_store31: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: xxbrq vs0, v3 -; CHECK-P9-NEXT: stxv vs0, 0(r7) +; CHECK-P9-NEXT: stxvb16x v3, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store31: @@ -720,8 +647,7 @@ ; ; CHECK-P9-LABEL: swap_store40: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: xxswapd vs0, v2 -; CHECK-P9-NEXT: stxv vs0, 0(r7) +; CHECK-P9-NEXT: stxvd2x v2, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store40: @@ -748,8 +674,7 @@ ; ; CHECK-P9-LABEL: swap_store41: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: xxswapd vs0, v3 -; CHECK-P9-NEXT: stxv vs0, 0(r7) +; CHECK-P9-NEXT: stxvd2x v3, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store41: @@ -771,20 +696,12 @@ define void @swap_store50(<4 x float> %v1, <4 x float> %v2, <4 x float>* %vp) { ; CHECK-P8-LABEL: swap_store50: ; CHECK-P8: # %bb.0: -; CHECK-P8-NEXT: addis r3, r2, .LCPI21_0@toc@ha -; CHECK-P8-NEXT: addi r3, r3, .LCPI21_0@toc@l -; CHECK-P8-NEXT: lvx v3, 
0, r3 -; CHECK-P8-NEXT: vperm v2, v2, v2, v3 -; CHECK-P8-NEXT: stvx v2, 0, r7 +; CHECK-P8-NEXT: stxvw4x v2, 0, r7 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: swap_store50: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis r3, r2, .LCPI21_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI21_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r3 -; CHECK-P9-NEXT: vperm v2, v2, v2, v3 -; CHECK-P9-NEXT: stxv v2, 0(r7) +; CHECK-P9-NEXT: stxvw4x v2, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store50: @@ -812,20 +729,12 @@ define void @swap_store51(<4 x float> %v1, <4 x float> %v2, <4 x float>* %vp) { ; CHECK-P8-LABEL: swap_store51: ; CHECK-P8: # %bb.0: -; CHECK-P8-NEXT: addis r3, r2, .LCPI22_0@toc@ha -; CHECK-P8-NEXT: addi r3, r3, .LCPI22_0@toc@l -; CHECK-P8-NEXT: lvx v2, 0, r3 -; CHECK-P8-NEXT: vperm v2, v3, v3, v2 -; CHECK-P8-NEXT: stvx v2, 0, r7 +; CHECK-P8-NEXT: stxvw4x v3, 0, r7 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: swap_store51: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: addis r3, r2, .LCPI22_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI22_0@toc@l -; CHECK-P9-NEXT: lxvx v2, 0, r3 -; CHECK-P9-NEXT: vperm v2, v3, v3, v2 -; CHECK-P9-NEXT: stxv v2, 0(r7) +; CHECK-P9-NEXT: stxvw4x v3, 0, r7 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-BE-LABEL: swap_store51: Index: llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll =================================================================== --- llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll +++ llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll @@ -85,8 +85,7 @@ ; CHECK: lxvd2x 34, 0, 3 ; CHECK-P9-LABEL: @test10 -; CHECK-P9: lxv 0, 0(3) -; CHECK-P9: xxswapd 34, 0 +; CHECK-P9: lxvd2x 34, 0, 3 } define <2 x double> @test11(<2 x double>* %p1, <2 x double>* %p2) { @@ -257,8 +256,7 @@ ; CHECK: lxvd2x 34, 0, 4 ; CHECK-P9-LABEL: @test32 -; CHECK-P9: lxv 0, 0(4) -; CHECK-P9: xxswapd 34, 0 +; CHECK-P9: lxvd2x 34, 0, 4 } define <2 x double> @test33(<2 x double>* %p1, <2 x double>* %p2) {