diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1211,6 +1211,8 @@
     SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineVectorShuffle(ShuffleVectorSDNode *SVN,
+                                 SelectionDAG &DAG) const;
     SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase,
                                  DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -126,6 +126,7 @@
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumSiblingCalls, "Number of sibling calls");
+STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
 
 static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
 
@@ -9385,6 +9386,15 @@
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+  // Any nodes that were combined in the target-independent combiner prior
+  // to vector legalization will not be sent to the target combine. Try to
+  // combine it here.
+  if (SDValue NewShuffle = combineVectorShuffle(SVOp, DAG)) {
+    DAG.ReplaceAllUsesOfValueWith(Op, NewShuffle);
+    Op = NewShuffle;
+    SVOp = cast<ShuffleVectorSDNode>(Op);
+  }
   EVT VT = Op.getValueType();
   bool isLittleEndian = Subtarget.isLittleEndian();
@@ -9683,7 +9693,13 @@
                          MVT::i32));
   }
 
+  ShufflesHandledWithVPERM++;
   SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
+  LLVM_DEBUG(dbgs() << "Emitting a VPERM for the following shuffle:\n");
+  LLVM_DEBUG(SVOp->dump());
+  LLVM_DEBUG(dbgs() << "With the following permute control vector:\n");
+  LLVM_DEBUG(VPermMask.dump());
+
   if (isLittleEndian)
     return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V2, V1,
                        VPermMask);
@@ -13801,6 +13817,87 @@
   return Val;
 }
 
+static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {
+  // Check that the source of the element keeps flipping
+  // (i.e. Mask[i] < NumElts -> Mask[i+1] >= NumElts).
+  bool PrevElemFromFirstVec = Mask[0] < NumElts;
+  for (int i = 1, e = Mask.size(); i < e; i++) {
+    if (PrevElemFromFirstVec && Mask[i] < NumElts)
+      return false;
+    if (!PrevElemFromFirstVec && Mask[i] >= NumElts)
+      return false;
+    PrevElemFromFirstVec = !PrevElemFromFirstVec;
+  }
+  return true;
+}
+
+static bool isSplatBV(SDValue Op) {
+  if (Op.getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+  SDValue FirstOp = Op.getOperand(0);
+  for (int i = 1, e = Op.getNumOperands(); i < e; i++)
+    if (Op.getOperand(i) != FirstOp)
+      return false;
+  return true;
+}
+
+// On little endian subtargets, combine shuffles such as:
+// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b
+// into:
+// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b
+// because the latter can be matched to a single instruction merge.
+SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
+                                                SelectionDAG &DAG) const {
+  SDValue LHS = SVN->getOperand(0);
+  SDValue RHS = SVN->getOperand(1);
+  auto Mask = SVN->getMask();
+  int NumElts = LHS.getValueType().getVectorNumElements();
+  SDValue Res;
+  SDLoc dl(SVN);
+
+  // If this is not a shuffle of a shuffle and the first element comes from
+  // the second vector, canonicalize to the commuted form. This will make it
+  // more likely to match one of the single instruction patterns.
+  if (Subtarget.isLittleEndian() && Mask[0] >= NumElts &&
+      LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+      RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {
+    std::swap(LHS, RHS);
+    Res = DAG.getCommutedVectorShuffle(*SVN);
+    Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
+  }
+  bool IsLHSSplat = isSplatBV(LHS);
+  bool IsRHSSplat = isSplatBV(RHS);
+  if (!IsLHSSplat && !IsRHSSplat)
+    return Res;
+
+  // We are looking for a mask such that all even elements are from
+  // one vector and all odd elements from the other.
+  if (!isAlternatingShuffMask(Mask, NumElts))
+    return Res;
+
+  SmallVector<int, 16> ShuffV(Mask.size());
+  for (int i = 0, e = Mask.size(); i < e; i++)
+    ShuffV[i] = Mask[i];
+
+  // The common case after we commuted the shuffle is that the RHS is a splat
+  // and we have elements coming in from the splat at indices that are not
+  // conducive to using a merge.
+  // Example:
+  // vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
+  if (!IsRHSSplat || Mask[0] >= NumElts)
+    return Res;
+
+  // Adjust the mask so we are pulling in the same index from the splat
+  // as the index from the interesting vector in consecutive elements.
+ // Example: + // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, + for (int i = 1, e = Mask.size(); i < e; i += 2) + ShuffV[i] = (ShuffV[i - 1] + NumElts); + + Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV); + return Res; +} + SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase, DAGCombinerInfo &DCI) const { @@ -13907,7 +14004,7 @@ LSBaseSDNode* LSBase = cast(N->getOperand(0)); return combineVReverseMemOP(cast(N), LSBase, DCI); } - break; + return combineVectorShuffle(cast(N), DCI.DAG); case ISD::STORE: { EVT Op1VT = N->getOperand(1).getValueType(); diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -0,0 +1,201 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-P8 +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-P9 + +define dso_local <16 x i8> @testmrghb(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrghb: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrghb v2, v3, v2 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrghb: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrghb v2, v3, v2 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> @testmrghb2(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrghb2: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-P8-NEXT: blr +; +; 
CHECK-P9-LABEL: testmrghb2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> @testmrghh(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrghh: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrghh: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> @testmrghh2(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrghh2: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrghh2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> @testmrglb(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrglb: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrglb: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> @testmrglb2(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrglb2: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrglb v2, v2, v3 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrglb2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrglb v2, v2, v3 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> 
@testmrglh(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrglh: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrglh: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> @testmrglh2(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrglh2: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrglh2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> @testmrghw(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrghw: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrghw v2, v3, v2 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrghw: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrghw v2, v3, v2 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> @testmrghw2(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrghw2: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrghw v2, v2, v3 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrghw2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrghw v2, v2, v3 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> @testmrglw(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrglw: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrglw v2, v3, v2 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrglw: +; CHECK-P9: # %bb.0: # %entry +; 
CHECK-P9-NEXT: vmrglw v2, v3, v2 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} +define dso_local <16 x i8> @testmrglw2(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrglw2: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: vmrglw v2, v2, v3 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrglw2: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: vmrglw v2, v2, v3 +; CHECK-P9-NEXT: blr +entry: + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> + ret <16 x i8> %shuffle +} + +define dso_local <8 x i16> @testmrglb3(<8 x i8>* nocapture readonly %a) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testmrglb3: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: ld r3, 0(r3) +; CHECK-P8-NEXT: xxlxor v3, v3, v3 +; CHECK-P8-NEXT: mtvsrd f0, r3 +; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testmrglb3: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lfd f0, 0(r3) +; CHECK-P9-NEXT: xxpermdi v2, f0, f0, 2 +; CHECK-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-P9-NEXT: blr +entry: + %0 = load <8 x i8>, <8 x i8>* %a, align 8 + %1 = zext <8 x i8> %0 to <8 x i16> + ret <8 x i16> %1 +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -24,7 +24,7 @@ ; CHECK-NEXT: vperm v0, v3, v5, v2 ; CHECK-NEXT: mtctr r5 ; CHECK-NEXT: li r5, 0 -; CHECK-NEXT: vperm v1, v5, v3, v4 +; CHECK-NEXT: vperm v1, v3, v5, v4 ; CHECK-NEXT: li r6, 0 ; CHECK-NEXT: xvnegsp v5, v0 ; CHECK-NEXT: xvnegsp v0, v1 @@ -34,7 +34,7 @@ ; CHECK-NEXT: lfd f0, 0(r3) ; CHECK-NEXT: xxpermdi v1, f0, f0, 2 ; CHECK-NEXT: lfdx f0, r3, r4 -; CHECK-NEXT: vperm v6, v1, v3, v4 +; CHECK-NEXT: vperm v6, v3, v1, v4 ; CHECK-NEXT: vperm v1, v3, v1, v2 
; CHECK-NEXT: xvnegsp v1, v1 ; CHECK-NEXT: add r7, r3, r4 @@ -48,7 +48,7 @@ ; CHECK-NEXT: vadduwm v1, v1, v6 ; CHECK-NEXT: xxpermdi v6, f0, f0, 2 ; CHECK-NEXT: vextuwrx r3, r5, v1 -; CHECK-NEXT: vperm v7, v6, v3, v4 +; CHECK-NEXT: vperm v7, v3, v6, v4 ; CHECK-NEXT: vperm v6, v3, v6, v2 ; CHECK-NEXT: add r6, r3, r6 ; CHECK-NEXT: add r3, r7, r4 @@ -192,10 +192,10 @@ ; CHECK-NEXT: xxlxor v3, v3, v3 ; CHECK-NEXT: lxvx v0, 0, r3 ; CHECK-NEXT: xxpermdi v1, f0, f0, 2 -; CHECK-NEXT: vperm v5, v2, v3, v4 +; CHECK-NEXT: vperm v5, v3, v2, v4 ; CHECK-NEXT: vperm v2, v3, v2, v0 ; CHECK-NEXT: vperm v0, v3, v1, v0 -; CHECK-NEXT: vperm v3, v1, v3, v4 +; CHECK-NEXT: vperm v3, v3, v1, v4 ; CHECK-NEXT: vabsduw v2, v2, v0 ; CHECK-NEXT: vabsduw v3, v5, v3 ; CHECK-NEXT: vadduwm v2, v3, v2 diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll --- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll @@ -16,7 +16,7 @@ ; P8LE-NEXT: addi r3, r4, .LCPI0_0@toc@l ; P8LE-NEXT: lvx v4, 0, r3 ; P8LE-NEXT: xxpermdi v3, f0, f0, 2 -; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: vperm v2, v2, v3, v4 ; P8LE-NEXT: blr ; P8BE-LABEL: s2v_test1: @@ -42,7 +42,7 @@ ; P8LE-NEXT: addi r3, r4, .LCPI1_0@toc@l ; P8LE-NEXT: lvx v4, 0, r3 ; P8LE-NEXT: xxpermdi v3, f0, f0, 2 -; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: vperm v2, v2, v3, v4 ; P8LE-NEXT: blr ; P8BE-LABEL: s2v_test2: @@ -70,7 +70,7 @@ ; P8LE-NEXT: addi r3, r4, .LCPI2_0@toc@l ; P8LE-NEXT: lvx v4, 0, r3 ; P8LE-NEXT: xxpermdi v3, f0, f0, 2 -; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: vperm v2, v2, v3, v4 ; P8LE-NEXT: blr ; P8BE-LABEL: s2v_test3: @@ -99,7 +99,7 @@ ; P8LE-NEXT: addi r3, r4, .LCPI3_0@toc@l ; P8LE-NEXT: lvx v4, 0, r3 ; P8LE-NEXT: xxpermdi v3, f0, f0, 2 -; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: vperm v2, v2, v3, v4 ; P8LE-NEXT: blr ; P8BE-LABEL: s2v_test4: @@ -126,7 +126,7 @@ ; P8LE-NEXT: addi r3, r3, 
.LCPI4_0@toc@l ; P8LE-NEXT: lvx v4, 0, r3 ; P8LE-NEXT: xxpermdi v3, f0, f0, 2 -; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: vperm v2, v2, v3, v4 ; P8LE-NEXT: blr ; P8BE-LABEL: s2v_test5: @@ -151,7 +151,7 @@ ; P8LE-NEXT: addi r3, r4, .LCPI5_0@toc@l ; P8LE-NEXT: lvx v4, 0, r3 ; P8LE-NEXT: xxpermdi v3, f0, f0, 2 -; P8LE-NEXT: vperm v2, v3, v2, v4 +; P8LE-NEXT: vperm v2, v2, v3, v4 ; P8LE-NEXT: blr ; P8BE-LABEL: s2v_test_f1: diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll --- a/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i16_to_fp32_elts.ll @@ -80,25 +80,19 @@ define <4 x float> @test4elt(i64 %a.coerce) local_unnamed_addr #1 { ; CHECK-P8-LABEL: test4elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: addis r4, r2, .LCPI1_0@toc@ha ; CHECK-P8-NEXT: mtvsrd f0, r3 -; CHECK-P8-NEXT: addi r3, r4, .LCPI1_0@toc@l -; CHECK-P8-NEXT: xxlxor v4, v4, v4 +; CHECK-P8-NEXT: xxlxor v3, v3, v3 ; CHECK-P8-NEXT: xxswapd v2, vs0 -; CHECK-P8-NEXT: lvx v3, 0, r3 -; CHECK-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-P8-NEXT: vmrglh v2, v3, v2 ; CHECK-P8-NEXT: xvcvuxwsp v2, v2 ; CHECK-P8-NEXT: blr ; ; CHECK-P9-LABEL: test4elt: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrd f0, r3 -; CHECK-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha -; CHECK-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r3 ; CHECK-P9-NEXT: xxswapd v2, vs0 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 -; CHECK-P9-NEXT: vperm v2, v4, v2, v3 +; CHECK-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-P9-NEXT: xvcvuxwsp v2, v2 ; CHECK-P9-NEXT: blr ; @@ -121,17 +115,11 @@ define void @test8elt(<8 x float>* noalias nocapture sret %agg.result, <8 x i16> %a) local_unnamed_addr #2 { ; CHECK-P8-LABEL: test8elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha -; CHECK-P8-NEXT: addis r5, r2, .LCPI2_1@toc@ha -; CHECK-P8-NEXT: xxlxor v4, v4, v4 -; 
CHECK-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l -; CHECK-P8-NEXT: lvx v3, 0, r4 -; CHECK-P8-NEXT: addi r4, r5, .LCPI2_1@toc@l -; CHECK-P8-NEXT: lvx v5, 0, r4 +; CHECK-P8-NEXT: xxlxor v3, v3, v3 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: vperm v3, v4, v2, v3 -; CHECK-P8-NEXT: vperm v2, v4, v2, v5 -; CHECK-P8-NEXT: xvcvuxwsp v3, v3 +; CHECK-P8-NEXT: vmrglh v4, v3, v2 +; CHECK-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-P8-NEXT: xvcvuxwsp v3, v4 ; CHECK-P8-NEXT: xvcvuxwsp v2, v2 ; CHECK-P8-NEXT: stvx v3, 0, r3 ; CHECK-P8-NEXT: stvx v2, r3, r4 @@ -139,19 +127,13 @@ ; ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha -; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l -; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: xxlxor v4, v4, v4 -; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha -; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l -; CHECK-P9-NEXT: vperm v3, v4, v2, v3 -; CHECK-P9-NEXT: xvcvuxwsp vs0, v3 -; CHECK-P9-NEXT: lxvx v3, 0, r4 -; CHECK-P9-NEXT: vperm v2, v4, v2, v3 -; CHECK-P9-NEXT: stxv vs0, 0(r3) +; CHECK-P9-NEXT: xxlxor v3, v3, v3 +; CHECK-P9-NEXT: vmrglh v4, v3, v2 +; CHECK-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-P9-NEXT: xvcvuxwsp vs0, v4 ; CHECK-P9-NEXT: xvcvuxwsp vs1, v2 ; CHECK-P9-NEXT: stxv vs1, 16(r3) +; CHECK-P9-NEXT: stxv vs0, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test8elt: