diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10314,11 +10314,6 @@ bool isLittleEndian = Subtarget.isLittleEndian(); bool isPPC64 = Subtarget.isPPC64(); - // Only need to place items backwards in LE, - // the mask will be properly calculated. - if (isLittleEndian) - std::swap(V1, V2); - if (Subtarget.hasVSX() && Subtarget.hasP9Vector() && (V1->hasOneUse() || V2->hasOneUse())) { LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using " @@ -10328,7 +10323,8 @@ // The second input to XXPERM is also an output so if the second input has // multiple uses then copying is necessary, as a result we want the // single-use operand to be used as the second input to prevent copying. - if (!V2->hasOneUse() && V1->hasOneUse()) { + if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) || + (isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) { std::swap(V1, V2); NeedSwap = !NeedSwap; } @@ -10367,27 +10363,24 @@ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; - if (Opcode == PPCISD::XXPERM) { - if (V1HasXXSWAPD) { - if (SrcElt < 8) - SrcElt += 8; - else if (SrcElt < 16) - SrcElt -= 8; - } - if (V2HasXXSWAPD) { - if (SrcElt > 23) - SrcElt -= 8; - else if (SrcElt > 15) - SrcElt += 8; - } - if (NeedSwap) { - if (SrcElt < 16) - SrcElt += 16; - else - SrcElt -= 16; - } + if (V1HasXXSWAPD) { + if (SrcElt < 8) + SrcElt += 8; + else if (SrcElt < 16) + SrcElt -= 8; + } + if (V2HasXXSWAPD) { + if (SrcElt > 23) + SrcElt -= 8; + else if (SrcElt > 15) + SrcElt += 8; + } + if (NeedSwap) { + if (SrcElt < 16) + SrcElt += 16; + else + SrcElt -= 16; } - for (unsigned j = 0; j != BytesPerElement; ++j) if (isLittleEndian) ResultMask.push_back( @@ -10397,18 +10390,19 @@ DAG.getConstant(SrcElt * BytesPerElement + j, dl, MVT::i32)); } - if (Opcode == PPCISD::XXPERM && (V1HasXXSWAPD || V2HasXXSWAPD)) { - if (V1HasXXSWAPD) { - dl = SDLoc(V1->getOperand(0)); - V1 = V1->getOperand(0)->getOperand(1); - } - if (V2HasXXSWAPD) { - dl = SDLoc(V2->getOperand(0)); - V2 = V2->getOperand(0)->getOperand(1); - } - if (isPPC64 && ValType != MVT::v2f64) + if (V1HasXXSWAPD) { + dl = SDLoc(V1->getOperand(0)); + V1 = V1->getOperand(0)->getOperand(1); + } + if (V2HasXXSWAPD) { + dl = SDLoc(V2->getOperand(0)); + V2 = V2->getOperand(0)->getOperand(1); + } + + if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) { + if (ValType != MVT::v2f64) V1 = DAG.getBitcast(MVT::v2f64, V1); - if (isPPC64 && V2.getValueType() != MVT::v2f64) + if (V2.getValueType() != MVT::v2f64) V2 = DAG.getBitcast(MVT::v2f64, V2); } @@ -10429,6 +10423,11 @@ if (Opcode == PPCISD::XXPERM) VPermMask = DAG.getBitcast(MVT::v4i32, VPermMask); + // Only need to place items backwards in LE, + // the mask was properly calculated. + if (isLittleEndian) + std::swap(V1, V2); + SDValue VPERMNode = DAG.getNode(Opcode, dl, V1.getValueType(), V1, V2, VPermMask); diff --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll --- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll @@ -1058,16 +1058,15 @@ ; ; P8LE-LABEL: fromDiffMemVarDi: ; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addis r5, r2, .LCPI9_0@toc@ha ; P8LE-NEXT: sldi r4, r4, 2 +; P8LE-NEXT: addi r5, r5, .LCPI9_0@toc@l ; P8LE-NEXT: add r3, r3, r4 +; P8LE-NEXT: lxvd2x vs0, 0, r5 ; P8LE-NEXT: addi r3, r3, -12 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI9_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI9_0@toc@l +; P8LE-NEXT: lxvd2x v3, 0, r3 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: blr entry: %idxprom = sext i32 %elem to i64 @@ -1478,13 +1477,12 @@ ; ; P8LE-LABEL: fromDiffMemConsDConvftoi: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI18_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI18_0@toc@l +; P8LE-NEXT: addis r4, r2, .LCPI18_0@toc@ha +; P8LE-NEXT: lxvd2x v3, 0, r3 +; P8LE-NEXT: addi r4, r4, .LCPI18_0@toc@l +; P8LE-NEXT: lxvd2x vs0, 0, r4 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: xvcvspsxws v2, v2 ; P8LE-NEXT: blr entry: @@ -2580,16 +2578,15 @@ ; ; P8LE-LABEL: fromDiffMemVarDui: ; P8LE: # %bb.0: # %entry +; P8LE-NEXT: addis r5, r2, .LCPI41_0@toc@ha ; P8LE-NEXT: sldi r4, r4, 2 +; P8LE-NEXT: addi r5, r5, .LCPI41_0@toc@l ; P8LE-NEXT: add r3, r3, r4 +; P8LE-NEXT: lxvd2x vs0, 0, r5 ; P8LE-NEXT: addi r3, r3, -12 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI41_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI41_0@toc@l +; P8LE-NEXT: lxvd2x v3, 0, r3 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: blr entry: %idxprom = sext i32 %elem to i64 @@ -3000,13 +2997,12 @@ ; ; P8LE-LABEL: fromDiffMemConsDConvftoui: ; P8LE: # %bb.0: # %entry -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: addis r3, r2, .LCPI50_0@toc@ha -; P8LE-NEXT: addi r3, r3, .LCPI50_0@toc@l +; P8LE-NEXT: addis r4, r2, .LCPI50_0@toc@ha +; P8LE-NEXT: lxvd2x v3, 0, r3 +; P8LE-NEXT: addi r4, r4, .LCPI50_0@toc@l +; P8LE-NEXT: lxvd2x vs0, 0, r4 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: lxvd2x vs0, 0, r3 -; P8LE-NEXT: xxswapd v3, vs0 -; P8LE-NEXT: vperm v2, v2, v2, v3 +; P8LE-NEXT: vperm v2, v3, v3, v2 ; P8LE-NEXT: xvcvspuxws v2, v2 ; P8LE-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll --- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll @@ -183,14 +183,13 @@ define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_none_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI2_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI2_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v16i8: @@ -431,14 +430,13 @@ define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_none_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI5_0@toc@ha +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI5_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI5_0@toc@l +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI5_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v8i16: diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll --- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll @@ -469,19 +469,18 @@ define void @test_none_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) { ; CHECK-LE-P8-LABEL: test_none_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI4_0@toc@ha -; CHECK-LE-P8-NEXT: lxsdx v4, 0, r3 +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3 ; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI4_1@toc@ha -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI4_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x v4, 0, r4 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l ; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI4_1@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r4 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs1 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v4, v2 ; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 ; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 ; CHECK-LE-P8-NEXT: xxswapd vs0, v2 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 diff --git a/llvm/test/CodeGen/PowerPC/vperm-swap.ll b/llvm/test/CodeGen/PowerPC/vperm-swap.ll --- a/llvm/test/CodeGen/PowerPC/vperm-swap.ll +++ b/llvm/test/CodeGen/PowerPC/vperm-swap.ll @@ -4,32 +4,31 @@ define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8: .LCPI0_0: -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 30 # 0x1e +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 22 # 0x16 ; CHECK-LE-P8-NEXT: .byte 7 # 0x7 -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f -; CHECK-LE-P8-NEXT: .byte 31 # 0x1f +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 +; CHECK-LE-P8-NEXT: .byte 23 # 0x17 ; CHECK-LE-P8-LABEL: test_none_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI0_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-LE-P8-NEXT: blr entry: %lhs = load <16 x i8>, ptr %b, align 4