Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4450,6 +4450,15 @@
   return Op;
 }
 
+// Return true if N is undef, a SPLAT_VECTOR of a constant zero, or a
+// BUILD_VECTOR accepted by ISD::isBuildVectorAllZeros.
+static bool isZeroOrUndefVector(SDValue N) {
+  if (N->getOpcode() == ISD::SPLAT_VECTOR)
+    if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
+      return Op->getZExtValue() == 0;
+  return N.isUndef() || ISD::isBuildVectorAllZeros(N.getNode());
+}
+
 // Bytes is a VPERM-like permute vector, except that -1 is used for
 // undefined bytes. Implement it on operands Ops[0] and Ops[1] using
 // VSLDB or VPERM.
@@ -4466,7 +4475,58 @@
                        Ops[OpNo1],
                        DAG.getTargetConstant(StartIndex, DL, MVT::i32));
 
-  // Fall back on VPERM. Construct an SDNode for the permute vector.
+  // Fall back on VPERM. Construct an SDNode for the permute vector. Try to
+  // eliminate a zero vector by reusing any zero index in the permute vector.
+  unsigned ZeroVecIdx = isZeroOrUndefVector(Ops[0]) ? 0
+    : (isZeroOrUndefVector(Ops[1]) ? 1 : UINT_MAX);
+  if (ZeroVecIdx != UINT_MAX) {
+    bool MaskFirst = true;
+    int ZeroIdx = -1;
+    for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+      // Undefined bytes (-1) neither need nor provide a zero; skip them
+      // instead of relying on how -1 wraps when converted to unsigned.
+      if (Bytes[I] < 0)
+        continue;
+      unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+      unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
+      if (OpNo == ZeroVecIdx && I == 0) {
+        // If the first byte is zero, use mask as first operand.
+        ZeroIdx = 0;
+        break;
+      }
+      if (OpNo != ZeroVecIdx && Byte == 0) {
+        // The mask itself gets a zero byte here: place the source vector
+        // first so that this mask byte can supply the zero.
+        ZeroIdx = I + SystemZ::VectorBytes;
+        MaskFirst = false;
+        break;
+      }
+    }
+    if (ZeroIdx != -1) {
+      SDValue IndexNodes[SystemZ::VectorBytes];
+      for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+        if (Bytes[I] >= 0) {
+          unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+          unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
+          if (OpNo == ZeroVecIdx)
+            IndexNodes[I] = DAG.getConstant(ZeroIdx, DL, MVT::i32);
+          else {
+            unsigned BIdx = MaskFirst ? Byte + SystemZ::VectorBytes : Byte;
+            IndexNodes[I] = DAG.getConstant(BIdx, DL, MVT::i32);
+          }
+        } else
+          IndexNodes[I] = DAG.getUNDEF(MVT::i32);
+      }
+      SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
+      SDValue Src = ZeroVecIdx == 0 ? Ops[1] : Ops[0];
+      if (MaskFirst)
+        return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Mask, Src,
+                           Mask);
+      return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Src, Mask,
+                         Mask);
+    }
+  }
+
   SDValue IndexNodes[SystemZ::VectorBytes];
   for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
     if (Bytes[I] >= 0)
Index: llvm/test/CodeGen/SystemZ/vec-perm-14.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/vec-perm-14.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+;
+; Test that a zero index in the permute vector is used instead of VGBM.
+
+define <4 x i8> @fun1(<2 x i8> %arg) {
+; CHECK-LABEL:.LCPI0_0:
+; CHECK-NEXT: .byte 1 # 0x1
+; CHECK-NEXT: .byte 18 # 0x12
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .byte 18 # 0x12
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .text
+; CHECK-NEXT: .globl fun1
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .type fun1,@function
+; CHECK-NEXT: fun1: # @fun1
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI0_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v24, %v0, %v0
+; CHECK-NEXT: br %r14
+; NOTE(review): the shuffle mask below was reconstructed from the expected
+; permute bytes above (indices 2 and 3 both select a zero element) - confirm.
+  %res = shufflevector <2 x i8> %arg, <2 x i8> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+  ret <4 x i8> %res
+}
+
+define <4 x i8> @fun2(<2 x i8> %arg) {
+; CHECK-LABEL:.LCPI1_0:
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .byte 17 # 0x11
+; CHECK-NEXT: .byte 16 # 0x10
+; CHECK-NEXT: .byte 0 # 0x0
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .space 1
+; CHECK-NEXT: .text
+; CHECK-NEXT: .globl fun2
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .type fun2,@function
+; CHECK-NEXT:fun2: # @fun2
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT:# %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI1_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
+; CHECK-NEXT: br %r14
+; NOTE(review): the shuffle mask below was reconstructed from the expected
+; permute bytes above (indices 2 and 3 both select a zero element) - confirm.
+  %res = shufflevector <2 x i8> %arg, <2 x i8> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
+  ret <4 x i8> %res
+}
Index: llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
+++ llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
@@ -5377,12 +5377,12 @@
 ; SZ13-LABEL: constrained_vector_fptrunc_v3f64:
 ; SZ13: # %bb.0: # %entry
 ; SZ13-NEXT: vl %v1, 0(%r2), 4
+; SZ13-NEXT: ld %f0, 16(%r2)
 ; SZ13-NEXT: vledb %v1, %v1, 0, 0
 ; SZ13-NEXT: larl %r1, .LCPI97_0
-; SZ13-NEXT: ld %f0, 16(%r2)
-; SZ13-NEXT: vl %v2, 0(%r1), 3
-; SZ13-NEXT: vperm %v1, %v1, %v0, %v2
 ; SZ13-NEXT: ledbra %f0, 0, %f0, 0
+; SZ13-NEXT: vl %v2, 0(%r1), 3
+; SZ13-NEXT: vperm %v1, %v1, %v2, %v2
 ; SZ13-NEXT: ste %f0, 8(%r3)
 ; SZ13-NEXT: vsteg %v1, 0(%r3), 0
 ; SZ13-NEXT: br %r14