diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -934,6 +934,7 @@
     setTargetDAGCombine(ISD::BUILD_VECTOR);
     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
     setTargetDAGCombine(ISD::STORE);
     setTargetDAGCombine(ISD::SIGN_EXTEND);
     setTargetDAGCombine(ISD::ZERO_EXTEND);
@@ -13313,6 +13314,29 @@
   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
 }
 
+static SDValue PerformExtractEltCombine(SDNode *N,
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Op0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  SDLoc dl(N);
+
+  // extract (vdup x) -> x
+  if (Op0->getOpcode() == ARMISD::VDUP) {
+    SDValue X = Op0->getOperand(0);
+    if (VT == MVT::f16 && X.getValueType() == MVT::i32)
+      return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
+    if (VT == MVT::i32 && X.getValueType() == MVT::f16)
+      return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
+
+    while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
+      X = X->getOperand(0);
+    if (X.getValueType() == VT)
+      return X;
+  }
+
+  return SDValue();
+}
+
 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
 /// ISD::VECTOR_SHUFFLE.
 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
@@ -15301,6 +15325,7 @@
   case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
+  case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI);
   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
   case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
@@ -5174,116 +5174,112 @@
 define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
 ; CHECK-MVE-LABEL: vcmp_oeq_v8f16_bc:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    ldrh r0, [r0]
+; CHECK-MVE-NEXT:    .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT:    vpush {d8, d9, d10}
+; CHECK-MVE-NEXT:    ldrh r1, [r0]
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
 ; CHECK-MVE-NEXT:    movs r2, #0
-; CHECK-MVE-NEXT:    movs r1, #0
-; CHECK-MVE-NEXT:    vdup.16 q4, r0
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    movs r0, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s17
-; CHECK-MVE-NEXT:    vcmp.f16 s12, s14
+; CHECK-MVE-NEXT:    vmov.f16 s16, r1
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    cmp r0, #0
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vcmp.f16 s0, s16
-; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
-; CHECK-MVE-NEXT:    lsls r0, r0, #31
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r2, #1
 ; CHECK-MVE-NEXT:    cmp r2, #0
 ; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r0, s12
+; CHECK-MVE-NEXT:    vmov r1, s12
 ; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s17
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s16
 ; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r0
-; CHECK-MVE-NEXT:    mov.w r0, #0
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s9, s5
-; CHECK-MVE-NEXT:    vmov r0, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s1
-; CHECK-MVE-NEXT:    vcmp.f16 s20, s22
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s9
-; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s18
-; CHECK-MVE-NEXT:    vmov r0, s20
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT:    vmov r1, s18
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s1
+; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
+; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r0
-; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s18
-; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s10, s6
-; CHECK-MVE-NEXT:    vmov r0, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s20, s22
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r0
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s5
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s20, s18
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vmov r1, s18
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
+; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s10
-; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s19
-; CHECK-MVE-NEXT:    vmov r0, s20
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT:    vmov r1, s18
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s18, s16
+; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r0
-; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s19
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s2
-; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s11, s7
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s20, s18
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vmov r1, s18
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s16
+; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmov r0, s20
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r0
-; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r0, #1
+; CHECK-MVE-NEXT:    cmp r0, #0
+; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    vmov r1, s18
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
+; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
 ; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
 ; CHECK-MVE-NEXT:    vmov r0, s0
 ; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
 ; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-MVE-NEXT:    vpop {d8, d9, d10}
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16_bc:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll
@@ -240,9 +240,6 @@
 define arm_aapcs_vfpcc float @vdup_f32_extract(float %src) {
 ; CHECK-LABEL: vdup_f32_extract:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vdup.32 q0, r0
-; CHECK-NEXT:    vmov.f32 s0, s2
 ; CHECK-NEXT:    bx lr
 entry:
   %srcbc = bitcast float %src to i32
@@ -260,8 +257,8 @@
 ; CHECK-NEXT:    vldr.16 s2, [r1]
 ; CHECK-NEXT:    vadd.f16 s0, s2, s0
 ; CHECK-NEXT:    vmov.f16 r1, s0
-; CHECK-NEXT:    vdup.16 q0, r1
-; CHECK-NEXT:    vstr.16 s1, [r0]
+; CHECK-NEXT:    vmov.f16 s0, r1
+; CHECK-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = load half, half *%src1, align 2
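For reference, a minimal LLVM IR sketch of the pattern the new PerformExtractEltCombine targets, modelled on the vdup_f32_extract test above (the function name @splat_then_extract and the chosen lane are illustrative, not part of this patch): a scalar is splatted, which the ARM backend lowers to ARMISD::VDUP, and a lane is immediately extracted again; with this combine the extract folds straight back to the original scalar, so the vdup/vmov round-trip disappears from the generated code.

; Illustrative example only (not part of the patch). The splat is built from
; insertelement + shufflevector and lowers to ARMISD::VDUP; extracting any
; lane of the splat now folds back to %src.
define arm_aapcs_vfpcc float @splat_then_extract(float %src) {
entry:
  %srcbc = bitcast float %src to i32
  %ins = insertelement <4 x i32> undef, i32 %srcbc, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %lane = extractelement <4 x i32> %splat, i32 2
  %res = bitcast i32 %lane to float
  ret float %res
}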