diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20799,7 +20799,7 @@
 /// completely. This has the potential to lose undef knowledge because the first
 /// shuffle may not have an undef mask element where the second one does. So
 /// only call this after doing simplifications based on demanded elements.
-static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) {
+static SDValue simplifyUnaryShuffleOfShuffle(ShuffleVectorSDNode *Shuf) {
   // shuf (shuf0 X, Y, Mask0), undef, Mask
   auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
   if (!Shuf0 || !Shuf->getOperand(1).isUndef())
@@ -20823,6 +20823,67 @@
   return Shuf->getOperand(0);
 }
 
+/// If we have a binary shuffle \p Shuf of a unary shuffle, fold the unary
+/// shuffle away into \p Shuf and update its mask.
+/// For Example:
+/// Detect this pattern:
+///   t0: v8i32 = vector_shuffle<0,5,u,u,u,u,u,u> t3, undef
+///   t1: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t0, t4
+///
+/// Combine to this node:
+///   t2: v8i32 = vector_shuffle<0,5,10,11,u,u,u,u> t3, t4
+static SDValue simplifyBinaryShuffleOfShuffle(ShuffleVectorSDNode *Shuf,
+                                              SelectionDAG &DAG) {
+  if (Shuf->getOperand(1).isUndef())
+    return SDValue();
+  // shuf (shuf0 X, Undef, Mask0), Y, Mask or shuf Y, (shuf0 X, Undef, Mask0)
+  auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
+  bool IsFirstOperandUnaryShuffle = true;
+  if (!Shuf0 || !Shuf0->getOperand(1).isUndef()) {
+    Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(1));
+    if (!Shuf0 || !Shuf0->getOperand(1).isUndef())
+      return SDValue();
+    IsFirstOperandUnaryShuffle = false;
+  }
+
+  ArrayRef<int> Mask = Shuf->getMask();
+  ArrayRef<int> Mask0 = Shuf0->getMask();
+  SmallVector<int, 16> NewMask(Mask.size(), -1);
+  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
+    // Ignore undef elements.
+    if (Mask[i] == -1)
+      continue;
+    assert(Mask[i] >= 0 && Mask[i] < 2 * e && "Unexpected shuffle mask value");
+    // Element taken from second operand
+    if (Mask[i] >= e) {
+      // If the first operand is being folded away then the mask is unchanged.
+      if (IsFirstOperandUnaryShuffle)
+        NewMask[i] = Mask[i];
+      else {
+        assert(Mask0[Mask[i] - e] != -1 &&
+               "Unexpected shuffle mask undef value");
+        NewMask[i] = Mask0[Mask[i] - e] + e;
+      }
+    } else {
+      // Element taken from first operand
+      if (IsFirstOperandUnaryShuffle) {
+        assert(Mask0[Mask[i]] != -1 && "Unexpected shuffle mask undef value");
+        NewMask[i] = Mask0[Mask[i]];
+      } else
+        // If the second operand is being folded away then the mask is
+        // unchanged.
+        NewMask[i] = Mask[i];
+    }
+  }
+
+  SDValue ToShuffleLeft =
+      IsFirstOperandUnaryShuffle ? Shuf0->getOperand(0) : Shuf->getOperand(0);
+  SDValue ToShuffleRight =
+      IsFirstOperandUnaryShuffle ? Shuf->getOperand(1) : Shuf0->getOperand(0);
+  return DAG.getVectorShuffle(Shuf->getValueType(0), SDLoc(Shuf), ToShuffleLeft,
+                              ToShuffleRight, NewMask);
+}
+
 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   EVT VT = N->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
@@ -20954,9 +21015,15 @@
   // This is intentionally placed after demanded elements simplification because
   // it could eliminate knowledge of undef elements created by this shuffle.
-  if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN))
+  if (SDValue ShufOp = simplifyUnaryShuffleOfShuffle(SVN))
     return ShufOp;
 
+  if (Level == AfterLegalizeDAG) {
+    SDValue ShufOp = simplifyBinaryShuffleOfShuffle(SVN, DAG);
+    if (ShufOp)
+      return ShufOp;
+  }
+
   // Match shuffles that can be converted to any_vector_extend_in_reg.
if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations)) return V; diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -349,23 +349,25 @@ ; CHECK-LABEL: test16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sldi r4, r4, 1 -; CHECK-NEXT: li r7, 16 ; CHECK-NEXT: add r6, r3, r4 -; CHECK-NEXT: lxsihzx v4, r3, r4 +; CHECK-NEXT: lxsihzx v2, r3, r4 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: mtvsrd v3, r3 ; CHECK-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; CHECK-NEXT: lxsihzx v2, r6, r7 -; CHECK-NEXT: li r6, 0 ; CHECK-NEXT: addi r3, r3, .LCPI3_0@toc@l -; CHECK-NEXT: mtvsrd v3, r6 -; CHECK-NEXT: vsplth v4, v4, 3 ; CHECK-NEXT: vsplth v2, v2, 3 -; CHECK-NEXT: vmrghh v4, v3, v4 +; CHECK-NEXT: lxvx v4, 0, r3 +; CHECK-NEXT: li r3, 16 ; CHECK-NEXT: vmrghh v2, v3, v2 -; CHECK-NEXT: vsplth v3, v3, 3 -; CHECK-NEXT: vmrglw v3, v4, v3 +; CHECK-NEXT: vperm v2, v2, v3, v4 +; CHECK-NEXT: lxsihzx v4, r6, r3 +; CHECK-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI3_1@toc@l +; CHECK-NEXT: vsplth v4, v4, 3 +; CHECK-NEXT: vmrghh v3, v3, v4 ; CHECK-NEXT: lxvx v4, 0, r3 ; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: vperm v2, v2, v3, v4 +; CHECK-NEXT: vperm v2, v3, v2, v4 ; CHECK-NEXT: xxspltw v3, v2, 2 ; CHECK-NEXT: vadduwm v2, v2, v3 ; CHECK-NEXT: vextuwrx r3, r3, v2 @@ -376,24 +378,26 @@ ; P9BE-LABEL: test16: ; P9BE: # %bb.0: # %entry ; P9BE-NEXT: sldi r4, r4, 1 -; P9BE-NEXT: li r7, 16 ; P9BE-NEXT: add r6, r3, r4 -; P9BE-NEXT: lxsihzx v4, r3, r4 +; P9BE-NEXT: lxsihzx v2, r3, r4 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; P9BE-NEXT: lxsihzx v2, r6, r7 -; P9BE-NEXT: li r6, 0 -; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l -; P9BE-NEXT: sldi r6, r6, 48 -; P9BE-NEXT: vsplth v4, v4, 3 -; P9BE-NEXT: mtvsrd v3, r6 ; P9BE-NEXT: vsplth v2, v2, 3 -; 
P9BE-NEXT: vmrghh v4, v3, v4 +; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l +; P9BE-NEXT: lxvx v4, 0, r3 ; P9BE-NEXT: vmrghh v2, v3, v2 -; P9BE-NEXT: vsplth v3, v3, 0 -; P9BE-NEXT: vmrghw v3, v3, v4 +; P9BE-NEXT: li r3, 16 +; P9BE-NEXT: vperm v2, v3, v2, v4 +; P9BE-NEXT: lxsihzx v4, r6, r3 +; P9BE-NEXT: addis r3, r2, .LCPI3_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI3_1@toc@l +; P9BE-NEXT: vsplth v4, v4, 3 +; P9BE-NEXT: vmrghh v3, v3, v4 ; P9BE-NEXT: lxvx v4, 0, r3 ; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: vperm v2, v3, v2, v4 +; P9BE-NEXT: vperm v2, v2, v3, v4 ; P9BE-NEXT: xxspltw v3, v2, 1 ; P9BE-NEXT: vadduwm v2, v2, v3 ; P9BE-NEXT: vextuwlx r3, r3, v2 @@ -439,19 +443,28 @@ ; CHECK-NEXT: lxsibzx v2, r3, r4 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: mtvsrd v3, r3 -; CHECK-NEXT: li r3, 8 -; CHECK-NEXT: lxsibzx v5, r6, r3 -; CHECK-NEXT: vspltb v4, v3, 7 ; CHECK-NEXT: addis r3, r2, .LCPI4_0@toc@ha -; CHECK-NEXT: vspltb v2, v2, 7 ; CHECK-NEXT: addi r3, r3, .LCPI4_0@toc@l +; CHECK-NEXT: vspltb v5, v3, 7 +; CHECK-NEXT: vspltb v2, v2, 7 +; CHECK-NEXT: lxvx v4, 0, r3 +; CHECK-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI4_1@toc@l ; CHECK-NEXT: vmrghb v2, v3, v2 +; CHECK-NEXT: lxvx v0, 0, r3 +; CHECK-NEXT: li r3, 8 +; CHECK-NEXT: vperm v2, v2, v3, v4 +; CHECK-NEXT: vperm v2, v2, v5, v0 +; CHECK-NEXT: lxsibzx v5, r6, r3 +; CHECK-NEXT: addis r3, r2, .LCPI4_2@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI4_2@toc@l ; CHECK-NEXT: vspltb v5, v5, 7 -; CHECK-NEXT: vmrglh v2, v2, v4 -; CHECK-NEXT: vmrghb v3, v3, v5 -; CHECK-NEXT: vmrglw v2, v2, v4 -; CHECK-NEXT: vmrglh v3, v3, v4 -; CHECK-NEXT: vmrglw v3, v4, v3 +; CHECK-NEXT: vmrghb v5, v3, v5 +; CHECK-NEXT: vperm v4, v5, v3, v4 +; CHECK-NEXT: lxvx v5, 0, r3 +; CHECK-NEXT: addis r3, r2, .LCPI4_3@toc@ha +; CHECK-NEXT: addi r3, r3, .LCPI4_3@toc@l +; CHECK-NEXT: vperm v3, v3, v4, v5 ; CHECK-NEXT: lxvx v4, 0, r3 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: vperm v2, v3, v2, v4 @@ -465,23 +478,26 @@ ; P9BE-LABEL: test8: ; P9BE: # %bb.0: # 
%entry ; P9BE-NEXT: add r6, r3, r4 -; P9BE-NEXT: li r7, 8 -; P9BE-NEXT: lxsibzx v4, r3, r4 +; P9BE-NEXT: lxsibzx v2, r3, r4 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: sldi r3, r3, 56 +; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha -; P9BE-NEXT: lxsibzx v2, r6, r7 -; P9BE-NEXT: li r6, 0 +; P9BE-NEXT: vspltb v2, v2, 7 ; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l -; P9BE-NEXT: sldi r6, r6, 56 +; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: vmrghb v2, v3, v2 +; P9BE-NEXT: li r3, 8 +; P9BE-NEXT: vperm v2, v2, v3, v4 +; P9BE-NEXT: lxsibzx v4, r6, r3 +; P9BE-NEXT: addis r3, r2, .LCPI4_1@toc@ha +; P9BE-NEXT: addi r3, r3, .LCPI4_1@toc@l ; P9BE-NEXT: vspltb v4, v4, 7 -; P9BE-NEXT: mtvsrd v3, r6 -; P9BE-NEXT: vspltb v2, v2, 7 ; P9BE-NEXT: vmrghb v4, v3, v4 -; P9BE-NEXT: vmrghb v2, v3, v2 ; P9BE-NEXT: vspltb v3, v3, 0 -; P9BE-NEXT: vmrghh v4, v4, v3 -; P9BE-NEXT: xxspltw v3, v3, 0 -; P9BE-NEXT: vmrghw v2, v4, v2 +; P9BE-NEXT: vmrghw v2, v2, v4 ; P9BE-NEXT: lxvx v4, 0, r3 +; P9BE-NEXT: xxspltw v3, v3, 0 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vperm v2, v3, v2, v4 ; P9BE-NEXT: xxspltw v3, v2, 1 diff --git a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll --- a/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_vector_test_4.ll @@ -254,10 +254,12 @@ define <2 x float> @s2v_test_f2(float* nocapture readonly %f64, <2 x float> %vec) { ; P9LE-LABEL: s2v_test_f2: ; P9LE: # %bb.0: # %entry +; P9LE-NEXT: addis r4, r2, .LCPI6_0@toc@ha ; P9LE-NEXT: addi r3, r3, 4 -; P9LE-NEXT: vmrglw v2, v2, v2 -; P9LE-NEXT: lxsiwzx v3, 0, r3 -; P9LE-NEXT: vmrghw v2, v2, v3 +; P9LE-NEXT: addi r4, r4, .LCPI6_0@toc@l +; P9LE-NEXT: lxsiwzx v4, 0, r3 +; P9LE-NEXT: lxvx v3, 0, r4 +; P9LE-NEXT: vperm v2, v2, v4, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: s2v_test_f2: @@ -271,10 +273,12 @@ ; ; P8LE-LABEL: s2v_test_f2: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: vmrglw v2, v2, v2 +; P8LE-NEXT: addis r4, r2, .LCPI6_0@toc@ha ; 
P8LE-NEXT: addi r3, r3, 4 -; P8LE-NEXT: lxsiwzx v3, 0, r3 -; P8LE-NEXT: vmrghw v2, v2, v3 +; P8LE-NEXT: addi r4, r4, .LCPI6_0@toc@l +; P8LE-NEXT: lxsiwzx v4, 0, r3 +; P8LE-NEXT: lvx v3, 0, r4 +; P8LE-NEXT: vperm v2, v2, v4, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: s2v_test_f2: @@ -297,9 +301,11 @@ ; P9LE-LABEL: s2v_test_f3: ; P9LE: # %bb.0: # %entry ; P9LE-NEXT: sldi r4, r7, 2 -; P9LE-NEXT: vmrglw v2, v2, v2 ; P9LE-NEXT: lxsiwzx v3, r3, r4 -; P9LE-NEXT: vmrghw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI7_0@toc@ha +; P9LE-NEXT: addi r3, r3, .LCPI7_0@toc@l +; P9LE-NEXT: lxvx v4, 0, r3 +; P9LE-NEXT: vperm v2, v2, v3, v4 ; P9LE-NEXT: blr ; ; P9BE-LABEL: s2v_test_f3: @@ -313,10 +319,12 @@ ; ; P8LE-LABEL: s2v_test_f3: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: vmrglw v2, v2, v2 -; P8LE-NEXT: sldi r4, r7, 2 -; P8LE-NEXT: lxsiwzx v3, r3, r4 -; P8LE-NEXT: vmrghw v2, v2, v3 +; P8LE-NEXT: addis r4, r2, .LCPI7_0@toc@ha +; P8LE-NEXT: sldi r5, r7, 2 +; P8LE-NEXT: addi r4, r4, .LCPI7_0@toc@l +; P8LE-NEXT: lxsiwzx v3, r3, r5 +; P8LE-NEXT: lvx v4, 0, r4 +; P8LE-NEXT: vperm v2, v2, v3, v4 ; P8LE-NEXT: blr ; ; P8BE-LABEL: s2v_test_f3: @@ -339,10 +347,12 @@ define <2 x float> @s2v_test_f4(float* nocapture readonly %f64, <2 x float> %vec) { ; P9LE-LABEL: s2v_test_f4: ; P9LE: # %bb.0: # %entry +; P9LE-NEXT: addis r4, r2, .LCPI8_0@toc@ha ; P9LE-NEXT: addi r3, r3, 4 -; P9LE-NEXT: vmrglw v2, v2, v2 -; P9LE-NEXT: lxsiwzx v3, 0, r3 -; P9LE-NEXT: vmrghw v2, v2, v3 +; P9LE-NEXT: addi r4, r4, .LCPI8_0@toc@l +; P9LE-NEXT: lxsiwzx v4, 0, r3 +; P9LE-NEXT: lxvx v3, 0, r4 +; P9LE-NEXT: vperm v2, v2, v4, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: s2v_test_f4: @@ -356,10 +366,12 @@ ; ; P8LE-LABEL: s2v_test_f4: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: vmrglw v2, v2, v2 +; P8LE-NEXT: addis r4, r2, .LCPI8_0@toc@ha ; P8LE-NEXT: addi r3, r3, 4 -; P8LE-NEXT: lxsiwzx v3, 0, r3 -; P8LE-NEXT: vmrghw v2, v2, v3 +; P8LE-NEXT: addi r4, r4, .LCPI8_0@toc@l +; P8LE-NEXT: lxsiwzx v4, 0, r3 +; P8LE-NEXT: lvx v3, 0, r4 +; P8LE-NEXT: 
vperm v2, v2, v4, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: s2v_test_f4: @@ -381,9 +393,11 @@ define <2 x float> @s2v_test_f5(<2 x float> %vec, float* nocapture readonly %ptr1) { ; P9LE-LABEL: s2v_test_f5: ; P9LE: # %bb.0: # %entry -; P9LE-NEXT: lxsiwzx v3, 0, r5 -; P9LE-NEXT: vmrglw v2, v2, v2 -; P9LE-NEXT: vmrghw v2, v2, v3 +; P9LE-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; P9LE-NEXT: lxsiwzx v4, 0, r5 +; P9LE-NEXT: addi r3, r3, .LCPI9_0@toc@l +; P9LE-NEXT: lxvx v3, 0, r3 +; P9LE-NEXT: vperm v2, v2, v4, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: s2v_test_f5: @@ -396,9 +410,11 @@ ; ; P8LE-LABEL: s2v_test_f5: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: vmrglw v2, v2, v2 -; P8LE-NEXT: lxsiwzx v3, 0, r5 -; P8LE-NEXT: vmrghw v2, v2, v3 +; P8LE-NEXT: addis r3, r2, .LCPI9_0@toc@ha +; P8LE-NEXT: lxsiwzx v4, 0, r5 +; P8LE-NEXT: addi r3, r3, .LCPI9_0@toc@l +; P8LE-NEXT: lvx v3, 0, r3 +; P8LE-NEXT: vperm v2, v2, v4, v3 ; P8LE-NEXT: blr ; ; P8BE-LABEL: s2v_test_f5: