Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7086,19 +7086,62 @@
     // Otherwise, duplicate from the lane of the input vector.
     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
 
-    // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
-    // to make a vector of the same size as this SHUFFLE. We can ignore the
-    // extract entirely, and canonicalise the concat using WidenVector.
-    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-      Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+    // Try to eliminate a bitcasted extract subvector before a DUPLANE.
+    auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
+      // Match: dup (bitcast (extract_subv X, C)), LaneC
+      if (BitCast.getOpcode() != ISD::BITCAST ||
+          BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
+        return false;
+
+      // The extract index must align in the destination type. That may not
+      // happen if the bitcast is from narrow to wide type.
+      SDValue Extract = BitCast.getOperand(0);
+      unsigned ExtIdx = Extract.getConstantOperandVal(1);
+      unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
+      unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
+      unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
+      if (ExtIdxInBits % CastedEltBitWidth != 0)
+        return false;
+
+      // Update the lane value by offsetting with the scaled extract index.
+      LaneC += ExtIdxInBits / CastedEltBitWidth;
+
+      // Determine the casted vector type of the wide vector input.
+      unsigned SrcVecNumElts =
+          Extract.getOperand(0).getValueType().getVectorNumElements();
+      if (CastedEltBitWidth > SrcEltBitWidth) {
+        // If casting to wider element type, divide number of elements.
+        SrcVecNumElts /= CastedEltBitWidth / SrcEltBitWidth;
+      } else {
+        // If casting to narrower element type, multiply number of elements.
+        SrcVecNumElts *= SrcEltBitWidth / CastedEltBitWidth;
+      }
+      // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
+      // Examples:
+      //  dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
+      //  dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
+      CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
+                                SrcVecNumElts);
+      return true;
+    };
+    MVT CastVT;
+    if (getScaledOffsetDup(V1, Lane, CastVT)) {
+      V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
+    } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      // The lane is incremented by the index of the extract.
+      // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
+      Lane += V1.getConstantOperandVal(1);
       V1 = V1.getOperand(0);
     } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+      // The lane is decremented if we are splatting from the 2nd operand.
+      // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
       unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
       Lane -= Idx * VT.getVectorNumElements() / 2;
       V1 = WidenVector(V1.getOperand(Idx), DAG);
-    } else if (VT.getSizeInBits() == 64)
+    } else if (VT.getSizeInBits() == 64) {
+      // Widen the operand to 128-bit register with undef.
       V1 = WidenVector(V1, DAG);
-
+    }
     return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
   }
 
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1663,8 +1663,7 @@
 define <2 x float> @test_vmul_laneq3_f32_bitcast(<2 x float> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq3_f32_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[3]
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
   %bc = bitcast <1 x double> %extract to <2 x float>
@@ -1676,8 +1675,7 @@
 define <2 x float> @test_vmul_laneq2_f32_bitcast(<2 x float> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq2_f32_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[2]
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
   %bc = bitcast <1 x double> %extract to <2 x float>
@@ -1689,8 +1687,7 @@
 define <4 x i16> @test_vadd_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vadd_laneq5_i16_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    dup v1.4h, v1.h[1]
+; CHECK-NEXT:    dup v1.4h, v1.h[5]
 ; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
@@ -1700,6 +1697,8 @@
   ret <4 x i16> %r
 }
 
+; TODO: The pattern in LowerVECTOR_SHUFFLE does not match what we are looking for.
+
 define <4 x i16> @test_vadd_lane2_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
 ; CHECK-LABEL: test_vadd_lane2_i16_bitcast_bigger_aligned:
 ; CHECK:       // %bb.0:
@@ -1717,8 +1716,7 @@
 define <4 x i16> @test_vadd_lane5_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
 ; CHECK-LABEL: test_vadd_lane5_i16_bitcast_bigger_aligned:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    dup v1.4h, v1.h[1]
+; CHECK-NEXT:    dup v1.4h, v1.h[5]
 ; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    ret
   %extract = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1728,6 +1726,8 @@
   ret <4 x i16> %r
 }
 
+; Negative test - can't dup bytes {3,4} of v8i16.
+
 define <4 x i16> @test_vadd_lane_i16_bitcast_bigger_unaligned(<4 x i16> %a, <16 x i8> %v) {
 ; CHECK-LABEL: test_vadd_lane_i16_bitcast_bigger_unaligned:
 ; CHECK:       // %bb.0: