Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -1652,6 +1652,11 @@ return true; } + /// Return true (the default) if it is profitable to remove a sext_inreg(x) + /// where the sext is redundant, and use x directly. A target can override + /// this to return false for extends it prefers to keep. + virtual bool shouldRemoveRedundantExtend(SDValue Op) const { return true; } + /// When splitting a value of the specified type into parts, does the Lo /// or Hi part come first? This usually follows the endianness, except /// for ppcf128, where the Hi part always comes first. Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -804,7 +804,8 @@ SDValue Op0 = Op.getOperand(0); EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); unsigned ExBits = ExVT.getScalarSizeInBits(); - if (DemandedBits.getActiveBits() <= ExBits) + if (DemandedBits.getActiveBits() <= ExBits && + shouldRemoveRedundantExtend(Op)) return Op0; // If the input is already sign extended, just drop the extension. 
unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1); Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -618,6 +618,8 @@ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override; + bool shouldRemoveRedundantExtend(SDValue Op) const override; + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; bool isTruncateFree(EVT VT1, EVT VT2) const override; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13653,6 +13653,26 @@ return true; } +// Treat an i32/i64 sext_inreg(extract_vector_elt(..)) of an i8 or i16 lane as +// free while the extend still has users: return false so the extend is kept +// instead of being replaced by its operand, which avoids emitting a umov next +// to the smov for the same lane. Return true in every other case. +bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const { + EVT VT = Extend.getValueType(); + if ((VT == MVT::i64 || VT == MVT::i32) && !Extend->use_empty()) { + SDValue Extract = Extend.getOperand(0); + // Look through a single-use any_extend sitting between extend and extract. + if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse()) + Extract = Extract.getOperand(0); + if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) { + EVT VecVT = Extract.getOperand(0).getValueType(); + if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16) + return false; + } + } + return true; +} + // Truncations from 64-bit GPR to 32-bit GPR is free. 
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) Index: llvm/test/CodeGen/AArch64/extract-sext-zext.ll =================================================================== --- llvm/test/CodeGen/AArch64/extract-sext-zext.ll +++ llvm/test/CodeGen/AArch64/extract-sext-zext.ll @@ -371,18 +371,11 @@ } define i32 @redundant_i16i32(<8 x i16> %x) { -; CHECK-ISEL-LABEL: redundant_i16i32: -; CHECK-ISEL: // %bb.0: -; CHECK-ISEL-NEXT: umov w8, v0.h[2] -; CHECK-ISEL-NEXT: smov w9, v0.h[2] -; CHECK-ISEL-NEXT: eor w0, w9, w8, lsl #16 -; CHECK-ISEL-NEXT: ret -; -; CHECK-GLOBAL-LABEL: redundant_i16i32: -; CHECK-GLOBAL: // %bb.0: -; CHECK-GLOBAL-NEXT: smov w8, v0.h[2] -; CHECK-GLOBAL-NEXT: eor w0, w8, w8, lsl #16 -; CHECK-GLOBAL-NEXT: ret +; CHECK-LABEL: redundant_i16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: smov w8, v0.h[2] +; CHECK-NEXT: eor w0, w8, w8, lsl #16 +; CHECK-NEXT: ret %e = extractelement <8 x i16> %x, i64 2 %s = sext i16 %e to i32 %t = shl i32 %s, 16 @@ -406,20 +399,12 @@ } define i32 @redundant_i8i32(<8 x i8> %x) { -; CHECK-ISEL-LABEL: redundant_i8i32: -; CHECK-ISEL: // %bb.0: -; CHECK-ISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-ISEL-NEXT: umov w8, v0.b[2] -; CHECK-ISEL-NEXT: smov w9, v0.b[2] -; CHECK-ISEL-NEXT: eor w0, w9, w8, lsl #24 -; CHECK-ISEL-NEXT: ret -; -; CHECK-GLOBAL-LABEL: redundant_i8i32: -; CHECK-GLOBAL: // %bb.0: -; CHECK-GLOBAL-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GLOBAL-NEXT: smov w8, v0.b[2] -; CHECK-GLOBAL-NEXT: eor w0, w8, w8, lsl #24 -; CHECK-GLOBAL-NEXT: ret +; CHECK-LABEL: redundant_i8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.b[2] +; CHECK-NEXT: eor w0, w8, w8, lsl #24 +; CHECK-NEXT: ret %e = extractelement <8 x i8> %x, i64 2 %s = sext i8 %e to i32 %t = shl i32 %s, 24 @@ -469,18 +454,11 @@ } define i64 @redundant_i16i64(<8 x i16> %x) { -; CHECK-ISEL-LABEL: redundant_i16i64: -; CHECK-ISEL: // 
%bb.0: -; CHECK-ISEL-NEXT: umov w8, v0.h[2] -; CHECK-ISEL-NEXT: smov x9, v0.h[2] -; CHECK-ISEL-NEXT: eor x0, x9, x8, lsl #48 -; CHECK-ISEL-NEXT: ret -; -; CHECK-GLOBAL-LABEL: redundant_i16i64: -; CHECK-GLOBAL: // %bb.0: -; CHECK-GLOBAL-NEXT: smov x8, v0.h[2] -; CHECK-GLOBAL-NEXT: eor x0, x8, x8, lsl #48 -; CHECK-GLOBAL-NEXT: ret +; CHECK-LABEL: redundant_i16i64: +; CHECK: // %bb.0: +; CHECK-NEXT: smov x8, v0.h[2] +; CHECK-NEXT: eor x0, x8, x8, lsl #48 +; CHECK-NEXT: ret %e = extractelement <8 x i16> %x, i64 2 %s = sext i16 %e to i64 %t = shl i64 %s, 48 @@ -504,20 +482,12 @@ } define i64 @redundant_i8i64(<8 x i8> %x) { -; CHECK-ISEL-LABEL: redundant_i8i64: -; CHECK-ISEL: // %bb.0: -; CHECK-ISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-ISEL-NEXT: umov w8, v0.b[2] -; CHECK-ISEL-NEXT: smov x9, v0.b[2] -; CHECK-ISEL-NEXT: eor x0, x9, x8, lsl #56 -; CHECK-ISEL-NEXT: ret -; -; CHECK-GLOBAL-LABEL: redundant_i8i64: -; CHECK-GLOBAL: // %bb.0: -; CHECK-GLOBAL-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GLOBAL-NEXT: smov x8, v0.b[2] -; CHECK-GLOBAL-NEXT: eor x0, x8, x8, lsl #56 -; CHECK-GLOBAL-NEXT: ret +; CHECK-LABEL: redundant_i8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov x8, v0.b[2] +; CHECK-NEXT: eor x0, x8, x8, lsl #56 +; CHECK-NEXT: ret %e = extractelement <8 x i8> %x, i64 2 %s = sext i8 %e to i64 %t = shl i64 %s, 56 Index: llvm/test/CodeGen/AArch64/srem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -106,47 +106,44 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[0] ; CHECK-NEXT: mov w8, #37253 -; CHECK-NEXT: smov w10, v0.h[1] ; CHECK-NEXT: movk w8, #44150, lsl #16 +; CHECK-NEXT: smov w10, v0.h[1] ; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: smov w14, v0.h[3] -; CHECK-NEXT: mov w12, #95 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: 
mov w14, #95 ; CHECK-NEXT: smull x13, w9, w8 ; CHECK-NEXT: smull x15, w10, w8 ; CHECK-NEXT: lsr x13, x13, #32 ; CHECK-NEXT: smull x16, w11, w8 -; CHECK-NEXT: lsr x15, x15, #32 ; CHECK-NEXT: add w13, w13, w9 -; CHECK-NEXT: add w10, w15, w10 -; CHECK-NEXT: asr w15, w13, #6 -; CHECK-NEXT: add w13, w15, w13, lsr #31 -; CHECK-NEXT: umov w15, v0.h[1] -; CHECK-NEXT: smull x8, w14, w8 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: asr w17, w13, #6 +; CHECK-NEXT: add w15, w15, w10 +; CHECK-NEXT: add w13, w17, w13, lsr #31 +; CHECK-NEXT: asr w17, w15, #6 +; CHECK-NEXT: add w15, w17, w15, lsr #31 +; CHECK-NEXT: smull x8, w12, w8 +; CHECK-NEXT: msub w9, w13, w14, w9 ; CHECK-NEXT: lsr x16, x16, #32 -; CHECK-NEXT: add w11, w16, w11 -; CHECK-NEXT: asr w16, w10, #6 -; CHECK-NEXT: msub w9, w13, w12, w9 -; CHECK-NEXT: add w10, w16, w10, lsr #31 +; CHECK-NEXT: add w16, w16, w11 +; CHECK-NEXT: msub w10, w15, w14, w10 +; CHECK-NEXT: asr w17, w16, #6 ; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: umov w16, v0.h[2] -; CHECK-NEXT: add w8, w8, w14 -; CHECK-NEXT: asr w14, w11, #6 -; CHECK-NEXT: add w11, w14, w11, lsr #31 -; CHECK-NEXT: msub w14, w10, w12, w15 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: umov w9, v0.h[3] -; CHECK-NEXT: fmov s0, w13 -; CHECK-NEXT: asr w13, w8, #6 -; CHECK-NEXT: add w8, w13, w8, lsr #31 -; CHECK-NEXT: msub w13, w11, w12, w16 -; CHECK-NEXT: mov v1.h[1], w14 +; CHECK-NEXT: fmov s1, w13 +; CHECK-NEXT: add w16, w17, w16, lsr #31 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: add w8, w8, w12 +; CHECK-NEXT: asr w9, w8, #6 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: msub w9, w16, w14, w11 ; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: msub w9, w8, w12, w9 -; CHECK-NEXT: mov v1.h[2], w13 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: mov v1.h[3], w9 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: mov v1.h[1], w15 +; CHECK-NEXT: msub w10, w8, w14, w12 +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: mov v1.h[2], w16 +; CHECK-NEXT: mov 
v0.h[3], w10 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95> %2 = sdiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>