Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -15197,8 +15197,17 @@
   return Res;
 }
 
-static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
+                                     const ARMSubtarget *ST) {
   SDValue Src = N->getOperand(0);
+  EVT DstVT = N->getValueType(0);
+
+  // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
+  if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
+    EVT SrcVT = Src.getValueType();
+    if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
+      return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
+  }
 
   // We may have a bitcast of something that has already had this bitcast
   // combine performed on it, so skip past any VECTOR_REG_CASTs.
@@ -15208,7 +15217,6 @@
   // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
   // would be generated is at least the width of the element type.
   EVT SrcVT = Src.getValueType();
-  EVT DstVT = N->getValueType(0);
   if ((Src.getOpcode() == ARMISD::VMOVIMM ||
        Src.getOpcode() == ARMISD::VMVNIMM ||
        Src.getOpcode() == ARMISD::VMOVFPIMM) &&
@@ -15273,7 +15281,7 @@
   case ARMISD::BUILD_VECTOR:
     return PerformARMBUILD_VECTORCombine(N, DCI);
   case ISD::BITCAST:
-    return PerformBITCASTCombine(N, DCI.DAG);
+    return PerformBITCASTCombine(N, DCI.DAG, Subtarget);
   case ARMISD::PREDICATE_CAST:
     return PerformPREDICATE_CASTCombine(N, DCI);
   case ARMISD::VECTOR_REG_CAST:
Index: llvm/test/CodeGen/Thumb2/mve-vaddqr.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vaddqr.ll
+++ llvm/test/CodeGen/Thumb2/mve-vaddqr.ll
@@ -131,8 +131,7 @@
 ; CHECK-LABEL: vaddqr_v4f32_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vdup.32 q1, r0
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vadd.f32 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %src2bc = bitcast float %src2 to i32
@@ -147,8 +146,7 @@
 ; CHECK-LABEL: vaddqr_v8f16_3:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vdup.16 q1, r0
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    vadd.f16 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %src2 = load half, half *%src2p, align 2
@@ -164,8 +162,7 @@
 ; CHECK-LABEL: vaddqr_v4f32_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vdup.32 q1, r0
-; CHECK-NEXT:    vadd.f32 q0, q1, q0
+; CHECK-NEXT:    vadd.f32 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %src2bc = bitcast float %src2 to i32
@@ -180,8 +177,7 @@
 ; CHECK-LABEL: vaddqr_v8f16_4:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vdup.16 q1, r0
-; CHECK-NEXT:    vadd.f16 q0, q1, q0
+; CHECK-NEXT:    vadd.f16 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %src2 = load half, half *%src2p, align 2
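
For reference, this is the IR shape the new combine targets, reconstructed as a
minimal standalone reproducer from the vaddqr_v4f32_3 test above (the function
name is illustrative, not part of the patch). A float is bitcast to i32 and
splatted as a v4i32 vdup, then bitcast back to v4f32; since both element types
are 32 bits wide, the combine rewrites the vdup to produce v4f32 directly, so
isel can pick the scalar-operand vadd.f32 q0, q0, r0 instead of materializing
the splat in a spare q register.

; Sketch only: function name is hypothetical; body mirrors vaddqr_v4f32_3.
define arm_aapcs_vfpcc <4 x float> @bitcast_vdup_splat(<4 x float> %src1, float %src2) {
entry:
  %src2bc = bitcast float %src2 to i32
  %i = insertelement <4 x i32> undef, i32 %src2bc, i32 0
  %splat = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer
  %splatbc = bitcast <4 x i32> %splat to <4 x float>
  %out = fadd <4 x float> %src1, %splatbc
  ret <4 x float> %out
}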