Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -14064,7 +14064,8 @@ } static SDValue PerformExtractEltCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *ST) { SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); SDLoc dl(N); @@ -14083,6 +14084,19 @@ return X; } + // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b + if (Op0.getValueType() == MVT::v4i32 && + isa<ConstantSDNode>(N->getOperand(1)) && + Op0.getOpcode() == ISD::BITCAST && + Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && + Op0.getOperand(0).getValueType() == MVT::v2f64) { + SDValue BV = Op0.getOperand(0); + unsigned Offset = N->getConstantOperandVal(1); + SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1); + if (MOV.getOpcode() == ARMISD::VMOVDRR) + return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2); + } + return SDValue(); } @@ -16502,7 +16516,8 @@ case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); - case ISD::EXTRACT_VECTOR_ELT: + return PerformExtractEltCombine(N, DCI, Subtarget); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); Index: llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll +++ llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll @@ -51,27 +51,21 @@ } define <2 x i64> @vector_add_i64(<2 x i64> %lhs, <2 x i64> %rhs) { -; CHECK-FP-LABEL: 
vector_add_i64: -; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: .save {r7, lr} -; CHECK-FP-NEXT: push {r7, lr} -; CHECK-FP-NEXT: vmov d1, r2, r3 -; CHECK-FP-NEXT: vmov d0, r0, r1 -; CHECK-FP-NEXT: add r0, sp, #8 -; CHECK-FP-NEXT: vldrw.u32 q1, [r0] -; CHECK-FP-NEXT: vmov lr, s2 -; CHECK-FP-NEXT: vmov r0, s0 -; CHECK-FP-NEXT: vmov r3, s4 -; CHECK-FP-NEXT: vmov r1, s1 -; CHECK-FP-NEXT: vmov r2, s5 -; CHECK-FP-NEXT: vmov r12, s3 -; CHECK-FP-NEXT: adds r0, r0, r3 -; CHECK-FP-NEXT: vmov r3, s7 -; CHECK-FP-NEXT: adcs r1, r2 -; CHECK-FP-NEXT: vmov r2, s6 -; CHECK-FP-NEXT: adds.w r2, r2, lr -; CHECK-FP-NEXT: adc.w r3, r3, r12 -; CHECK-FP-NEXT: pop {r7, pc} +; CHECK-LABEL: vector_add_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: add.w r12, sp, #8 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vmov lr, s0 +; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: vmov lr, s2 +; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: vmov r12, s3 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: pop {r7, pc} entry: %sum = add <2 x i64> %lhs, %rhs ret <2 x i64> %sum