diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -888,7 +888,7 @@
   setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
                        ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
                        ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
-                       ISD::INSERT_SUBVECTOR, ISD::STORE});
+                       ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
   if (Subtarget->supportsAddressTopByteIgnored())
     setTargetDAGCombine(ISD::LOAD);
 
@@ -16031,6 +16031,57 @@
   return SDValue();
 }
 
+/// Fold a v2i32 BUILD_VECTOR whose two operands are adjacent constant-index
+/// EXTRACT_VECTOR_ELTs of the same vector into an EXTRACT_SUBVECTOR of the
+/// any-extended source vector. Returns an empty SDValue when the pattern does
+/// not match or the widened vector type is not legal.
+static SDValue performBuildVectorCombine(SDNode *N,
+                                         TargetLowering::DAGCombinerInfo &DCI,
+                                         SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  // A build vector of two extracted elements is equivalent to an
+  // extract subvector where the inner vector is any-extended to the
+  // extract_vector_elt VT.
+  //    (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
+  //                  (extract_elt_iXX_to_i32 vec Idx+1))
+  // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
+
+  // For now, only consider the v2i32 case, which arises as a result of
+  // legalization.
+  if (VT != MVT::v2i32)
+    return SDValue();
+
+  SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
+  // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
+  if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      // Constant index.
+      isa<ConstantSDNode>(Elt0->getOperand(1)) &&
+      isa<ConstantSDNode>(Elt1->getOperand(1)) &&
+      // Both EXTRACT_VECTOR_ELT from same vector...
+      Elt0->getOperand(0) == Elt1->getOperand(0) &&
+      // ... and contiguous. First element's index +1 == second element's index.
+      Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
+      // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
+      // ResultType's known minimum vector length.
+      Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
+    SDValue VecToExtend = Elt0->getOperand(0);
+    EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
+      return SDValue();
+
+    SDValue SubvectorIdx =
+        DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
+
+    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext, SubvectorIdx);
+  }
+
+  return SDValue();
+}
+
 static SDValue performAddSubCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
@@ -19500,6 +19551,8 @@
   case ISD::ADD:
   case ISD::SUB:
     return performAddSubCombine(N, DCI, DAG);
+  case ISD::BUILD_VECTOR:
+    return performBuildVectorCombine(N, DCI, DAG);
   case AArch64ISD::ANDS:
     return performFlagSettingCombine(N, DCI, ISD::AND);
   case AArch64ISD::ADC:
diff --git a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
--- a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll
@@ -6,11 +6,7 @@
 define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) {
 ; CHECK-LABEL: bitcast_v2i16_v2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[1]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
   %y = bitcast <2 x half> %x to <2 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -101,11 +101,8 @@
 define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL:
extract_subvector_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: umov w9, v0.h[3] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -575,10 +575,7 @@ ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v1.4h, v1.4h, #0.0 -; CHECK-NEXT: umov w8, v1.h[0] -; CHECK-NEXT: umov w9, v1.h[1] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-NEXT: sshr v1.2s, v1.2s, #16 ; CHECK-NEXT: fmov w8, s1 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -17,10 +17,7 @@ ; CHECK-NEXT: ldr s2, [x1] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: fcmeq v1.4h, v1.4h, v2.4h -; CHECK-NEXT: umov w8, v1.h[0] -; CHECK-NEXT: umov w9, v1.h[1] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-NEXT: sshr v1.2s, v1.2s, #16 ; CHECK-NEXT: fmov w8, s1 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -538,10 +538,7 @@ ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0 ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: umov w8, v2.h[0] -; CHECK-NEXT: umov w9, v2.h[1] -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov v2.s[1], w9 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-NEXT: shl v2.2s, v2.2s, #16 ; CHECK-NEXT: sshr v2.2s, v2.2s, #16 ; CHECK-NEXT: fmov w8, s2 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -17,10 +17,7 @@ ; CHECK-NEXT: ldr s2, [x1] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: fcmeq v2.4h, v1.4h, v2.4h -; CHECK-NEXT: umov w8, v2.h[0] -; CHECK-NEXT: umov w9, v2.h[1] -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov v2.s[1], w9 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-NEXT: shl v2.2s, v2.2s, #16 ; CHECK-NEXT: sshr v2.2s, v2.2s, #16 ; CHECK-NEXT: fmov w8, s2 diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll --- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll @@ -194,23 +194,16 @@ define <4 x double> @sitofp_v4i8_double(<4 x i8> %a) { ; CHECK-LABEL: sitofp_v4i8_double: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov v1.s[1], w10 -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: shl v0.2s, v0.2s, #24 -; CHECK-NEXT: sshr v1.2s, v1.2s, #24 ; CHECK-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-NEXT: shl v1.2s, v1.2s, #24 +; CHECK-NEXT: sshll 
v0.2d, v0.2s, #0 +; CHECK-NEXT: sshr v1.2s, v1.2s, #24 +; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 -; CHECK-NEXT: sshll v2.2d, v0.2s, #0 -; CHECK-NEXT: scvtf v0.2d, v1.2d -; CHECK-NEXT: scvtf v1.2d, v2.2d +; CHECK-NEXT: scvtf v1.2d, v1.2d ; CHECK-NEXT: ret %1 = sitofp <4 x i8> %a to <4 x double> ret <4 x double> %1 @@ -333,39 +326,26 @@ define <8 x double> @sitofp_i16_double(<8 x i16> %a) { ; CHECK-LABEL: sitofp_i16_double: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[1] -; CHECK-NEXT: umov w10, v1.h[0] -; CHECK-NEXT: umov w12, v1.h[2] -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: umov w9, v1.h[1] -; CHECK-NEXT: fmov s3, w10 -; CHECK-NEXT: umov w10, v1.h[3] -; CHECK-NEXT: fmov s1, w12 -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: mov v2.s[1], w11 -; CHECK-NEXT: mov v3.s[1], w9 -; CHECK-NEXT: mov v1.s[1], w10 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: shl v2.2s, v2.2s, #16 -; CHECK-NEXT: sshr v0.2s, v0.2s, #16 -; CHECK-NEXT: shl v3.2s, v3.2s, #16 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: shl v2.2s, v1.2s, #16 +; CHECK-NEXT: shl v3.2s, v0.2s, #16 +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-NEXT: sshr v2.2s, v2.2s, #16 -; CHECK-NEXT: sshll v4.2d, v0.2s, #0 -; CHECK-NEXT: sshr v0.2s, v3.2s, #16 -; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: sshr v3.2s, v3.2s, #16 ; CHECK-NEXT: sshll v2.2d, v2.2s, #0 -; CHECK-NEXT: sshll v3.2d, v0.2s, #0 -; CHECK-NEXT: sshll v5.2d, v1.2s, #0 -; CHECK-NEXT: scvtf v0.2d, v2.2d -; CHECK-NEXT: scvtf v1.2d, v4.2d -; CHECK-NEXT: scvtf v2.2d, v3.2d -; CHECK-NEXT: scvtf v3.2d, v5.2d +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: scvtf v2.2d, v2.2d +; 
CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: sshll v3.2d, v3.2s, #0 +; CHECK-NEXT: sshll v4.2d, v1.2s, #0 +; CHECK-NEXT: sshll v1.2d, v0.2s, #0 +; CHECK-NEXT: scvtf v0.2d, v3.2d +; CHECK-NEXT: scvtf v1.2d, v1.2d +; CHECK-NEXT: scvtf v3.2d, v4.2d ; CHECK-NEXT: ret %1 = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 @@ -402,22 +382,15 @@ define <4 x double> @uitofp_v4i8_double(<4 x i8> %a) { ; CHECK-LABEL: uitofp_v4i8_double: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: movi d1, #0x0000ff000000ff -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov v2.s[1], w10 -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: and v2.8b, v2.8b, v1.8b +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: ushll v1.2d, v2.2s, #0 -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ucvtf v0.2d, v1.2d -; CHECK-NEXT: ucvtf v1.2d, v2.2d +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: and v1.8b, v2.8b, v1.8b +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v1.2d, v1.2d ; CHECK-NEXT: ret %1 = uitofp <4 x i8> %a to <4 x double> ret <4 x double> %1 @@ -530,36 +503,23 @@ define <8 x double> @uitofp_i16_double(<8 x i16> %a) { ; CHECK-LABEL: uitofp_i16_double: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[1] ; CHECK-NEXT: movi d1, #0x00ffff0000ffff -; CHECK-NEXT: umov w10, v2.h[0] -; CHECK-NEXT: umov w12, v2.h[2] -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: umov w9, v2.h[1] -; CHECK-NEXT: fmov s4, w10 -; CHECK-NEXT: umov w10, v2.h[3] -; CHECK-NEXT: fmov s2, w12 -; 
CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: mov v3.s[1], w11 -; CHECK-NEXT: mov v4.s[1], w9 -; CHECK-NEXT: mov v2.s[1], w10 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: and v3.8b, v3.8b, v1.8b -; CHECK-NEXT: ushll v5.2d, v0.2s, #0 -; CHECK-NEXT: and v0.8b, v4.8b, v1.8b -; CHECK-NEXT: and v1.8b, v2.8b, v1.8b +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: and v3.8b, v2.8b, v1.8b +; CHECK-NEXT: and v4.8b, v0.8b, v1.8b +; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: ushll v3.2d, v3.2s, #0 -; CHECK-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-NEXT: ushll v4.2d, v1.2s, #0 -; CHECK-NEXT: ucvtf v0.2d, v3.2d -; CHECK-NEXT: ucvtf v1.2d, v5.2d -; CHECK-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NEXT: ucvtf v3.2d, v4.2d +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: and v2.8b, v2.8b, v1.8b +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushll v5.2d, v2.2s, #0 +; CHECK-NEXT: ucvtf v2.2d, v3.2d +; CHECK-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-NEXT: ucvtf v0.2d, v4.2d +; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: ucvtf v3.2d, v5.2d ; CHECK-NEXT: ret %1 = uitofp <8 x i16> %a to <8 x double> ret <8 x double> %1