Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -191,6 +191,8 @@
     setOperationAction(ISD::SHL, VT, Custom);
     setOperationAction(ISD::SRA, VT, Custom);
     setOperationAction(ISD::SRL, VT, Custom);
+    setOperationAction(ISD::AND, VT, Custom);
+    setOperationAction(ISD::OR, VT, Custom);
   }
 
   // Neon does not support vector divide/remainder operations.
@@ -254,6 +256,8 @@
     setOperationAction(ISD::SHL, VT, Custom);
     setOperationAction(ISD::SRA, VT, Custom);
     setOperationAction(ISD::SRL, VT, Custom);
+    setOperationAction(ISD::AND, VT, Custom);
+    setOperationAction(ISD::OR, VT, Custom);
     setOperationAction(ISD::SMIN, VT, Legal);
     setOperationAction(ISD::SMAX, VT, Legal);
     setOperationAction(ISD::UMIN, VT, Legal);
@@ -4966,6 +4970,216 @@
   return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
 }
 
+/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
+/// valid vector constant for a NEON or MVE instruction with a "modified
+/// immediate" operand (e.g., VMOV). If so, return the encoded value.
+static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+                                 unsigned SplatBitSize, SelectionDAG &DAG,
+                                 const SDLoc &dl, EVT &VT, EVT VectorVT,
+                                 VMOVModImmType type) {
+  unsigned OpCmode, Imm;
+  bool is128Bits = VectorVT.is128BitVector();
+
+  // SplatBitSize is set to the smallest size that splats the vector, so a
+  // zero vector will always have SplatBitSize == 8. However, NEON modified
+  // immediate instructions other than VMOV do not support the 8-bit encoding
+  // of a zero vector, and the default encoding of zero is supposed to be the
+  // 32-bit version.
+  if (SplatBits == 0)
+    SplatBitSize = 32;
+
+  switch (SplatBitSize) {
+  case 8:
+    if (type != VMOVModImm)
+      return SDValue();
+    // Any 1-byte value is OK. Op=0, Cmode=1110.
+    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+    OpCmode = 0xe;
+    Imm = SplatBits;
+    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
+    break;
+
+  case 16:
+    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
+    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x00nn: Op=x, Cmode=100x.
+      OpCmode = 0x8;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0xnn00: Op=x, Cmode=101x.
+      OpCmode = 0xa;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    return SDValue();
+
+  case 32:
+    // NEON's 32-bit VMOV supports splat values where:
+    // * only one byte is nonzero, or
+    // * the least significant byte is 0xff and the second byte is nonzero, or
+    // * the least significant 2 bytes are 0xff and the third is nonzero.
+    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x000000nn: Op=x, Cmode=000x.
+      OpCmode = 0;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0x0000nn00: Op=x, Cmode=001x.
+      OpCmode = 0x2;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    if ((SplatBits & ~0xff0000) == 0) {
+      // Value = 0x00nn0000: Op=x, Cmode=010x.
+      OpCmode = 0x4;
+      Imm = SplatBits >> 16;
+      break;
+    }
+    if ((SplatBits & ~0xff000000) == 0) {
+      // Value = 0xnn000000: Op=x, Cmode=011x.
+      OpCmode = 0x6;
+      Imm = SplatBits >> 24;
+      break;
+    }
+
+    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
+    if (type == OtherModImm) return SDValue();
+
+    if ((SplatBits & ~0xffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
+      // Value = 0x0000nnff: Op=x, Cmode=1100.
+      OpCmode = 0xc;
+      Imm = SplatBits >> 8;
+      break;
+    }
+
+    // cmode == 0b1101 is not supported for MVE VMVN
+    if (type == MVEVMVNModImm)
+      return SDValue();
+
+    if ((SplatBits & ~0xffffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
+      // Value = 0x00nnffff: Op=x, Cmode=1101.
+      OpCmode = 0xd;
+      Imm = SplatBits >> 16;
+      break;
+    }
+
+    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
+    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
+    // VMOV.I32. A (very) minor optimization would be to replicate the value
+    // and fall through here to test for a valid 64-bit splat. But, then the
+    // caller would also need to check and handle the change in size.
+    return SDValue();
+
+  case 64: {
+    if (type != VMOVModImm)
+      return SDValue();
+    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
+    uint64_t BitMask = 0xff;
+    unsigned ImmMask = 1;
+    Imm = 0;
+    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
+      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
+        Imm |= ImmMask;
+      } else if ((SplatBits & BitMask) != 0) {
+        return SDValue();
+      }
+      BitMask <<= 8;
+      ImmMask <<= 1;
+    }
+
+    if (DAG.getDataLayout().isBigEndian()) {
+      // Reverse the order of elements within the vector.
+      unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
+      unsigned Mask = (1 << BytesPerElem) - 1;
+      unsigned NumElems = 8 / BytesPerElem;
+      unsigned NewImm = 0;
+      for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
+        unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
+        NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
+      }
+      Imm = NewImm;
+    }
+
+    // Op=1, Cmode=1110.
+    OpCmode = 0x1e;
+    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
+    break;
+  }
+
+  default:
+    llvm_unreachable("unexpected size for isVMOVModifiedImm");
+  }
+
+  unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
+  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
+}
+
+// Custom lower AND(X, C) -> VBICIMM
+static SDValue LowerAND(SDValue Op, SelectionDAG &DAG,
+                        const ARMSubtarget *Subtarget) {
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
+        SplatBitSize == 64) {
+      EVT VBICVT;
+      SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, DL, VBICVT, VT, OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input = DAG.getNode(ISD::BITCAST, DL, VBICVT, Op.getOperand(0));
+        SDValue VBIC = DAG.getNode(ARMISD::VBICIMM, DL, VBICVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, DL, VT, VBIC);
+      }
+    }
+  }
+
+  return Op;
+}
+
+// Custom lower OR(X, C) -> VORRIMM
+static SDValue LowerOR(SDValue Op, SelectionDAG &DAG,
+                       const ARMSubtarget *Subtarget) {
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
+        SplatBitSize == 64) {
+      EVT VorrVT;
+      SDValue Val =
+          isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+                            SplatBitSize, DAG, DL, VorrVT, VT, OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+            DAG.getNode(ISD::BITCAST, DL, VorrVT, Op.getOperand(0));
+        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, DL, VorrVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, DL, VT, Vorr);
+      }
+    }
+  }
+
+  return Op;
+}
+
 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDValue Cond = Op.getOperand(0);
   SDValue SelectTrue = Op.getOperand(1);
@@ -6745,157 +6959,6 @@
                       CCR, Chain.getValue(1));
 }
 
-/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
-/// valid vector constant for a NEON or MVE instruction with a "modified
-/// immediate" operand (e.g., VMOV). If so, return the encoded value.
-static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
-                                 unsigned SplatBitSize, SelectionDAG &DAG,
-                                 const SDLoc &dl, EVT &VT, EVT VectorVT,
-                                 VMOVModImmType type) {
-  unsigned OpCmode, Imm;
-  bool is128Bits = VectorVT.is128BitVector();
-
-  // SplatBitSize is set to the smallest size that splats the vector, so a
-  // zero vector will always have SplatBitSize == 8. However, NEON modified
-  // immediate instructions others than VMOV do not support the 8-bit encoding
-  // of a zero vector, and the default encoding of zero is supposed to be the
-  // 32-bit version.
-  if (SplatBits == 0)
-    SplatBitSize = 32;
-
-  switch (SplatBitSize) {
-  case 8:
-    if (type != VMOVModImm)
-      return SDValue();
-    // Any 1-byte value is OK. Op=0, Cmode=1110.
-    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
-    OpCmode = 0xe;
-    Imm = SplatBits;
-    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
-    break;
-
-  case 16:
-    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
-    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
-    if ((SplatBits & ~0xff) == 0) {
-      // Value = 0x00nn: Op=x, Cmode=100x.
-      OpCmode = 0x8;
-      Imm = SplatBits;
-      break;
-    }
-    if ((SplatBits & ~0xff00) == 0) {
-      // Value = 0xnn00: Op=x, Cmode=101x.
-      OpCmode = 0xa;
-      Imm = SplatBits >> 8;
-      break;
-    }
-    return SDValue();
-
-  case 32:
-    // NEON's 32-bit VMOV supports splat values where:
-    // * only one byte is nonzero, or
-    // * the least significant byte is 0xff and the second byte is nonzero, or
-    // * the least significant 2 bytes are 0xff and the third is nonzero.
-    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
-    if ((SplatBits & ~0xff) == 0) {
-      // Value = 0x000000nn: Op=x, Cmode=000x.
-      OpCmode = 0;
-      Imm = SplatBits;
-      break;
-    }
-    if ((SplatBits & ~0xff00) == 0) {
-      // Value = 0x0000nn00: Op=x, Cmode=001x.
-      OpCmode = 0x2;
-      Imm = SplatBits >> 8;
-      break;
-    }
-    if ((SplatBits & ~0xff0000) == 0) {
-      // Value = 0x00nn0000: Op=x, Cmode=010x.
-      OpCmode = 0x4;
-      Imm = SplatBits >> 16;
-      break;
-    }
-    if ((SplatBits & ~0xff000000) == 0) {
-      // Value = 0xnn000000: Op=x, Cmode=011x.
-      OpCmode = 0x6;
-      Imm = SplatBits >> 24;
-      break;
-    }
-
-    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
-    if (type == OtherModImm) return SDValue();
-
-    if ((SplatBits & ~0xffff) == 0 &&
-        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
-      // Value = 0x0000nnff: Op=x, Cmode=1100.
-      OpCmode = 0xc;
-      Imm = SplatBits >> 8;
-      break;
-    }
-
-    // cmode == 0b1101 is not supported for MVE VMVN
-    if (type == MVEVMVNModImm)
-      return SDValue();
-
-    if ((SplatBits & ~0xffffff) == 0 &&
-        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
-      // Value = 0x00nnffff: Op=x, Cmode=1101.
-      OpCmode = 0xd;
-      Imm = SplatBits >> 16;
-      break;
-    }
-
-    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
-    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
-    // VMOV.I32. A (very) minor optimization would be to replicate the value
-    // and fall through here to test for a valid 64-bit splat. But, then the
-    // caller would also need to check and handle the change in size.
-    return SDValue();
-
-  case 64: {
-    if (type != VMOVModImm)
-      return SDValue();
-    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
-    uint64_t BitMask = 0xff;
-    unsigned ImmMask = 1;
-    Imm = 0;
-    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
-      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
-        Imm |= ImmMask;
-      } else if ((SplatBits & BitMask) != 0) {
-        return SDValue();
-      }
-      BitMask <<= 8;
-      ImmMask <<= 1;
-    }
-
-    if (DAG.getDataLayout().isBigEndian()) {
-      // Reverse the order of elements within the vector.
-      unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
-      unsigned Mask = (1 << BytesPerElem) - 1;
-      unsigned NumElems = 8 / BytesPerElem;
-      unsigned NewImm = 0;
-      for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
-        unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
-        NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
-      }
-      Imm = NewImm;
-    }
-
-    // Op=1, Cmode=1110.
-    OpCmode = 0x1e;
-    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
-    break;
-  }
-
-  default:
-    llvm_unreachable("unexpected size for isVMOVModifiedImm");
-  }
-
-  unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
-  return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
-}
-
 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                            const ARMSubtarget *ST) const {
   EVT VT = Op.getValueType();
@@ -10119,6 +10182,10 @@
   case ISD::SADDSAT:
   case ISD::SSUBSAT:
     return LowerSADDSUBSAT(Op, DAG, Subtarget);
+  case ISD::AND:
+    return LowerAND(Op, DAG, Subtarget);
+  case ISD::OR:
+    return LowerOR(Op, DAG, Subtarget);
   case ISD::LOAD:
     return LowerPredicateLoad(Op, DAG);
   case ISD::STORE:
@@ -13567,8 +13634,6 @@
 static SDValue PerformANDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
-  // Attempt to use immediate-form VBIC
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
@@ -13577,26 +13642,6 @@
       VT == MVT::v8i1 || VT == MVT::v16i1)
     return SDValue();
 
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
-      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
-    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
-        SplatBitSize == 64) {
-      EVT VbicVT;
-      SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
-                                      SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, dl, VbicVT, VT, OtherModImm);
-      if (Val.getNode()) {
-        SDValue Input =
-            DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
-        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
-        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
-      }
-    }
-  }
-
   if (!Subtarget->isThumb1Only()) {
     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
@@ -13861,8 +13906,6 @@
 static SDValue PerformORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
-  // Attempt to use immediate-form VORR
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
@@ -13874,26 +13917,6 @@
       (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
     return PerformORCombine_i1(N, DCI, Subtarget);
 
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
-      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
-    if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
-        SplatBitSize == 64) {
-      EVT VorrVT;
-      SDValue Val =
-          isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
-                            SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
-      if (Val.getNode()) {
-        SDValue Input =
-            DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
-        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
-        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
-      }
-    }
-  }
-
   if (!Subtarget->isThumb1Only()) {
     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1311,7 +1311,23 @@
     { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
     { ISD::SREM, 
MVT::v16i8, 16 * FunctionCallDivCost}, { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost}, - // Multiplication. + // Bitwise operations are custom but still cheap + { ISD::AND, MVT::v8i8, 1}, + { ISD::OR, MVT::v8i8, 1}, + { ISD::AND, MVT::v4i16, 1}, + { ISD::OR, MVT::v4i16, 1}, + { ISD::AND, MVT::v2i32, 1}, + { ISD::OR, MVT::v2i32, 1}, + { ISD::AND, MVT::v1i64, 1}, + { ISD::OR, MVT::v1i64, 1}, + { ISD::AND, MVT::v16i8, 1}, + { ISD::OR, MVT::v16i8, 1}, + { ISD::AND, MVT::v8i16, 1}, + { ISD::OR, MVT::v8i16, 1}, + { ISD::AND, MVT::v4i32, 1}, + { ISD::OR, MVT::v4i32, 1}, + { ISD::AND, MVT::v2i64, 1}, + { ISD::OR, MVT::v2i64, 1}, }; if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) Index: llvm/test/Analysis/CostModel/ARM/arith-overflow.ll =================================================================== --- llvm/test/Analysis/CostModel/ARM/arith-overflow.ll +++ llvm/test/Analysis/CostModel/ARM/arith-overflow.ll @@ -115,11 +115,11 @@ ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'sadd' @@ -431,11 +431,11 @@ ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> 
undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'ssub' Index: llvm/test/Analysis/CostModel/ARM/arith-ssat.ll =================================================================== --- llvm/test/Analysis/CostModel/ARM/arith-ssat.ll +++ llvm/test/Analysis/CostModel/ARM/arith-ssat.ll @@ -148,14 +148,14 @@ ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for 
instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'add' @@ -354,14 +354,14 @@ ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'sub' Index: llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -3204,11 +3204,8 @@ ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 ; CHECK-NEXT: vmullt.u8 q2, q1, q0 ; CHECK-NEXT: vmullb.u8 q0, q1, q0 -; CHECK-NEXT: vqshrnb.u16 q2, q2, #7 ; CHECK-NEXT: vqshrnb.u16 q0, q0, #7 -; CHECK-NEXT: vmovlb.u8 q2, q2 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vqshrnt.u16 q0, q2, #7 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB21_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block Index: llvm/test/CodeGen/Thumb2/mve-vqshl.ll 
=================================================================== --- llvm/test/CodeGen/Thumb2/mve-vqshl.ll +++ llvm/test/CodeGen/Thumb2/mve-vqshl.ll @@ -725,13 +725,10 @@ ; CHECK-NEXT: vmovlt.u8 q3, q0 ; CHECK-NEXT: vmovlb.u8 q1, q1 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vshl.u16 q2, q3, q2 ; CHECK-NEXT: vshl.u16 q0, q0, q1 -; CHECK-NEXT: vqmovnb.u16 q2, q2 +; CHECK-NEXT: vshl.u16 q2, q3, q2 ; CHECK-NEXT: vqmovnb.u16 q0, q0 -; CHECK-NEXT: vmovlb.u8 q2, q2 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vmovnt.i16 q0, q2 +; CHECK-NEXT: vqmovnt.u16 q0, q2 ; CHECK-NEXT: bx lr entry: %e0 = zext <16 x i8> %s0 to <16 x i16> @@ -1510,11 +1507,8 @@ ; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: vshl.u16 q1, r0 ; CHECK-NEXT: vshl.u16 q0, r0 -; CHECK-NEXT: vqmovnb.u16 q1, q1 ; CHECK-NEXT: vqmovnb.u16 q0, q0 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vmovnt.i16 q0, q1 +; CHECK-NEXT: vqmovnt.u16 q0, q1 ; CHECK-NEXT: bx lr entry: %e0 = zext <16 x i8> %s0 to <16 x i16> @@ -2045,13 +2039,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovlt.u8 q1, q0 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vshl.i16 q1, q1, #3 ; CHECK-NEXT: vshl.i16 q0, q0, #3 -; CHECK-NEXT: vqmovnb.u16 q1, q1 +; CHECK-NEXT: vshl.i16 q1, q1, #3 ; CHECK-NEXT: vqmovnb.u16 q0, q0 -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vmovnt.i16 q0, q1 +; CHECK-NEXT: vqmovnt.u16 q0, q1 ; CHECK-NEXT: bx lr entry: %e0 = zext <16 x i8> %s0 to <16 x i16>
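Editorial note, not part of the patch: the IR below is a minimal illustrative sketch of the pattern the new LowerAND path targets (the function name and constant are invented for illustration). An AND against a splat whose complement is a valid modified immediate (here ~0xffffff00 == 0xff) is expected to select an immediate-form VBIC, e.g. vbic.i32 q0, #255, on both NEON and MVE instead of materializing the mask in a register; the same logic previously lived in PerformANDCombine and is now applied during custom lowering.

define <4 x i32> @and_inverted_modimm(<4 x i32> %x) {
entry:
  ; -256 == 0xffffff00; its complement 0xff is encodable as a 32-bit modified immediate
  %r = and <4 x i32> %x, <i32 -256, i32 -256, i32 -256, i32 -256>
  ret <4 x i32> %r
}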