[SystemZ] Reuse a zero byte of the VPERM mask instead of a separate zero vector

When one of the two operands of a general byte permute is an all-zeros
vector, the VPERM fallback now tries to build the permute mask so that the
zero bytes are read from the mask itself, which is passed as one of the two
VPERM source operands. Two cases are handled:
  * result byte 0 comes from the zero vector: mask byte 0 is then 0, so the
    mask is placed first and index 0 selects that zero byte;
  * some result byte I selects byte 0 of the non-zero operand: mask byte I is
    then 0, so the non-zero operand is placed first and index 16 + I selects
    that zero byte.
The new tests check that fun1/fun2 are lowered to a constant-pool load
followed by a single vperm.

NOTE(review): indentation/blank context lines and the two shufflevector
masks were reconstructed to be consistent with the CHECK lines; confirm them
against the tree (apply with --ignore-whitespace if context does not match).

Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4450,6 +4450,13 @@
   return Op;
 }
 
+static bool isZeroVector(SDValue N) {
+  if (N->getOpcode() == ISD::SPLAT_VECTOR)
+    if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
+      return Op->getZExtValue() == 0;
+  return ISD::isBuildVectorAllZeros(N.getNode());
+}
+
 // Bytes is a VPERM-like permute vector, except that -1 is used for
 // undefined bytes. Implement it on operands Ops[0] and Ops[1] using
 // VSLDB or VPERM.
@@ -4466,7 +4473,54 @@
                        Ops[OpNo1],
                        DAG.getTargetConstant(StartIndex, DL, MVT::i32));
 
-  // Fall back on VPERM. Construct an SDNode for the permute vector.
+  // Fall back on VPERM. Construct an SDNode for the permute vector. Try to
+  // eliminate a zero vector by reusing any zero index in the permute vector.
+  unsigned ZeroVecIdx =
+      isZeroVector(Ops[0]) ? 0 : (isZeroVector(Ops[1]) ? 1 : UINT_MAX);
+  if (ZeroVecIdx != UINT_MAX) {
+    bool MaskFirst = true;
+    int ZeroIdx = -1;
+    for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+      unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+      unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
+      if (OpNo == ZeroVecIdx && I == 0) {
+        // If the first byte is zero, use mask as first operand.
+        ZeroIdx = 0;
+        break;
+      }
+      if (OpNo != ZeroVecIdx && Byte == 0) {
+        // If mask contains a zero, use it by placing that vector first.
+        ZeroIdx = I + SystemZ::VectorBytes;
+        MaskFirst = false;
+        break;
+      }
+    }
+    if (ZeroIdx != -1) {
+      SDValue IndexNodes[SystemZ::VectorBytes];
+      for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+        if (Bytes[I] >= 0) {
+          unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+          unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
+          if (OpNo == ZeroVecIdx)
+            IndexNodes[I] = DAG.getConstant(ZeroIdx, DL, MVT::i32);
+          else {
+            unsigned BIdx = MaskFirst ? Byte + SystemZ::VectorBytes : Byte;
+            IndexNodes[I] = DAG.getConstant(BIdx, DL, MVT::i32);
+          }
+        } else
+          IndexNodes[I] = DAG.getUNDEF(MVT::i32);
+      }
+      SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
+      SDValue Src = ZeroVecIdx == 0 ? Ops[1] : Ops[0];
+      if (MaskFirst)
+        return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Mask, Src,
+                           Mask);
+      else
+        return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Src, Mask,
+                           Mask);
+    }
+  }
+
   SDValue IndexNodes[SystemZ::VectorBytes];
   for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
     if (Bytes[I] >= 0)
Index: llvm/test/CodeGen/SystemZ/vec-perm-14.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/vec-perm-14.ll
+++ llvm/test/CodeGen/SystemZ/vec-perm-14.ll
@@ -3,7 +3,7 @@
 ; Test that only one vperm of the vector compare is needed for both extracts.
 
 define void @fun() {
-; CHECK-LABEL: fun
+; CHECK-LABEL: fun:
 ; CHECK: vperm
 ; CHECK-NOT: vperm
 bb:
@@ -25,3 +25,74 @@
 bb4:
   unreachable
 }
+
+; Test that a zero index in the permute vector is used instead of VGBM, with
+; a zero index into the other source operand.
+define <4 x i8> @fun1(<2 x i8> %arg) {
+; CHECK-LABEL:.LCPI1_0:
+; CHECK-NEXT:    .byte 1 # 0x1
+; CHECK-NEXT:    .byte 18 # 0x12
+; CHECK-NEXT:    .byte 0 # 0x0
+; CHECK-NEXT:    .byte 18 # 0x12
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .text
+; CHECK-NEXT:    .globl fun1
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:    .type fun1,@function
+; CHECK-NEXT: fun1: # @fun1
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI1_0
+; CHECK-NEXT:    vl %v0, 0(%r1), 3
+; CHECK-NEXT:    vperm %v24, %v24, %v0, %v0
+; CHECK-NEXT:    br %r14
+  %res = shufflevector <2 x i8> %arg, <2 x i8> zeroinitializer,
+                       <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+  ret <4 x i8> %res
+}
+
+; Same, but with the first byte indexing into an element of the zero vector.
+define <4 x i8> @fun2(<2 x i8> %arg) {
+; CHECK-LABEL:.LCPI2_0:
+; CHECK-NEXT:    .byte 0 # 0x0
+; CHECK-NEXT:    .byte 17 # 0x11
+; CHECK-NEXT:    .byte 17 # 0x11
+; CHECK-NEXT:    .byte 0 # 0x0
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .space 1
+; CHECK-NEXT:    .text
+; CHECK-NEXT:    .globl fun2
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:    .type fun2,@function
+; CHECK-NEXT:fun2: # @fun2
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:# %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI2_0
+; CHECK-NEXT:    vl %v0, 0(%r1), 3
+; CHECK-NEXT:    vperm %v24, %v0, %v24, %v0
+; CHECK-NEXT:    br %r14
+  %res = shufflevector <2 x i8> %arg, <2 x i8> zeroinitializer,
+                       <4 x i32> <i32 3, i32 1, i32 1, i32 2>
+  ret <4 x i8> %res
+}