Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -934,6 +934,7 @@
     setTargetDAGCombine(ISD::BUILD_VECTOR);
     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
     setTargetDAGCombine(ISD::STORE);
     setTargetDAGCombine(ISD::SIGN_EXTEND);
     setTargetDAGCombine(ISD::ZERO_EXTEND);
@@ -13266,6 +13267,29 @@
   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
 }
 
+/// PerformExtractEltCombine - Target-specific dag combine xforms for
+/// ISD::EXTRACT_VECTOR_ELT. Folds extract (vdup x) -> x.
+static SDValue PerformExtractEltCombine(SDNode *N,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Vec = N->getOperand(0);
+  if (Vec->getOpcode() != ARMISD::VDUP)
+    return SDValue();
+
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Src = Vec->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  // An f16/i32 mismatch is bridged with the matching VMOVhr/VMOVrh node.
+  if (VT == MVT::f16 && SrcVT == MVT::i32)
+    return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, Src);
+  if (VT == MVT::i32 && SrcVT == MVT::f16)
+    return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, Src);
+  // Otherwise look through bitcasts for a source of the extracted type.
+  while (Src.getValueType() != VT && Src->getOpcode() == ISD::BITCAST)
+    Src = Src->getOperand(0);
+  return Src.getValueType() == VT ? Src : SDValue();
+}
+
 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
 /// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { @@ -15253,6 +15277,7 @@ case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); + case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); Index: llvm/test/CodeGen/Thumb2/mve-vdup.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vdup.ll +++ llvm/test/CodeGen/Thumb2/mve-vdup.ll @@ -240,9 +240,6 @@ define arm_aapcs_vfpcc float @vdup_f32_extract(float %src) { ; CHECK-LABEL: vdup_f32_extract: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vmov.f32 s0, s2 ; CHECK-NEXT: bx lr entry: %srcbc = bitcast float %src to i32 @@ -260,8 +257,8 @@ ; CHECK-NEXT: vldr.16 s2, [r1] ; CHECK-NEXT: vadd.f16 s0, s2, s0 ; CHECK-NEXT: vmov.f16 r1, s0 -; CHECK-NEXT: vdup.16 q0, r1 -; CHECK-NEXT: vstr.16 s1, [r0] +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: bx lr entry: %0 = load half, half *%src1, align 2