Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7043,7 +7043,22 @@
     // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
     // to make a vector of the same size as this SHUFFLE. We can ignore the
     // extract entirely, and canonicalise the concat using WidenVector.
-    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+    unsigned VTEltBitWidth = VT.getScalarSizeInBits();
+    if (V1.getOpcode() == ISD::BITCAST &&
+        V1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        V1.getOperand(0).getScalarValueSizeInBits() % VTEltBitWidth == 0) {
+      // If the extract is bitcast to smaller type, offset the DUPLANE index to
+      // account for that and bitcast the DUPLANE operand.
+      SDValue SrcOp = V1.getOperand(0);
+      unsigned ExtIdx = SrcOp.getConstantOperandVal(1);
+      unsigned Scale = SrcOp.getScalarValueSizeInBits() / VTEltBitWidth;
+      Lane += ExtIdx * Scale;
+      unsigned WideVecNumElts =
+          SrcOp.getOperand(0).getValueType().getVectorNumElements();
+      MVT CastVT = MVT::getVectorVT(VT.getSimpleVT().getScalarType(),
+                                    WideVecNumElts * Scale);
+      V1 = DAG.getBitcast(CastVT, SrcOp.getOperand(0));
+    } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
       Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
       V1 = V1.getOperand(0);
     } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1663,8 +1663,7 @@
 define <2 x float> @test_vmul_laneq3_f32_bitcast(<2 x float> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq3_f32_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[3]
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
   %bc = bitcast <1 x double> %extract to <2 x float>
@@ -1676,8 +1675,7 @@
 define <2 x float> @test_vmul_laneq2_f32_bitcast(<2 x float> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq2_f32_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[2]
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
   %bc = bitcast <1 x double> %extract to <2 x float>
@@ -1689,8 +1687,7 @@
 define <4 x i16> @test_vmul_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq5_i16_bitcast:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    dup v1.4h, v1.h[1]
+; CHECK-NEXT:    dup v1.4h, v1.h[5]
 ; CHECK-NEXT:    add v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    ret
   %extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>