Index: llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp =================================================================== --- llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1860,6 +1860,66 @@ default: return false; + case ARM::VBSPd: + case ARM::VBSPq: { + Register DstReg = MI.getOperand(0).getReg(); + if (DstReg == MI.getOperand(3).getReg()) { + // Expand to VBIT + unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBITd : ARM::VBITq; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) + .add(MI.getOperand(0)) + .add(MI.getOperand(3)) + .add(MI.getOperand(2)) + .add(MI.getOperand(1)) + .addImm(MI.getOperand(4).getImm()) + .add(MI.getOperand(5)); + } else if (DstReg == MI.getOperand(2).getReg()) { + // Expand to VBIF + unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBIFd : ARM::VBIFq; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) + .add(MI.getOperand(0)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(1)) + .addImm(MI.getOperand(4).getImm()) + .add(MI.getOperand(5)); + } else { + // Expand to VBSL + unsigned NewOpc = Opcode == ARM::VBSPd ? ARM::VBSLd : ARM::VBSLq; + if (DstReg == MI.getOperand(1).getReg()) { + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .addImm(MI.getOperand(4).getImm()) + .add(MI.getOperand(5)); + } else { + // Use move to satisfy constraints + unsigned MoveOpc = Opcode == ARM::VBSPd ? ARM::VORRd : ARM::VORRq; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MoveOpc)) + .addReg(DstReg, + RegState::Define | + getRenamableRegState(MI.getOperand(0).isRenamable())) + .add(MI.getOperand(1)) + .add(MI.getOperand(1)) + .addImm(MI.getOperand(4).getImm()) + .add(MI.getOperand(5)); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)) + .add(MI.getOperand(0)) + .addReg(DstReg, + RegState::Kill | + getRenamableRegState(MI.getOperand(0).isRenamable())) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .addImm(MI.getOperand(4).getImm()) + .add(MI.getOperand(5)); + } + } + MI.eraseFromParent(); + return true; + } + case ARM::TCRETURNdi: case ARM::TCRETURNri: { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); Index: llvm/lib/Target/ARM/ARMISelLowering.h =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.h +++ llvm/lib/Target/ARM/ARMISelLowering.h @@ -271,8 +271,8 @@ // Vector AND with NOT of immediate VBICIMM, - // Vector bitwise select - VBSL, + // Pseudo vector bitwise select + VBSP, // Pseudo-instruction representing a memory copy using ldm/stm // instructions. Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1755,7 +1755,7 @@ case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; - case ARMISD::VBSL: return "ARMISD::VBSL"; + case ARMISD::VBSP: return "ARMISD::VBSP"; case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; @@ -13153,7 +13153,7 @@ // Canonicalize the vector type to make instruction selection // simpler. EVT CanonicalVT = VT.is128BitVector() ? 
MVT::v4i32 : MVT::v2i32; - SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, + SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT, N0->getOperand(1), N0->getOperand(0), N1->getOperand(0)); Index: llvm/lib/Target/ARM/ARMInstrNEON.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrNEON.td +++ llvm/lib/Target/ARM/ARMInstrNEON.td @@ -509,7 +509,7 @@ def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>; def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>; -def NEONvbsl : SDNode<"ARMISD::VBSL", +def NEONvbsp : SDNode<"ARMISD::VBSP", SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -4526,9 +4526,9 @@ (SubReg_i16_lane imm:$lane)))>; def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), - (v4i32 (int_arm_neon_vqrdmulh + (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), - (v4i32 (ARMvduplane (v4i32 QPR:$src3), + (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane)))))), (v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), @@ -4579,17 +4579,17 @@ (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), - (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, + (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; def : Pat<(v8i16 (ssubsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), - (v8i16 (ARMvduplane (v8i16 QPR:$src3), + (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane)))))), (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2), - (v4i16 (EXTRACT_SUBREG + (v4i16 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; @@ -4601,7 +4601,7 @@ imm:$lane)))))), (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2), - (v2i32 (EXTRACT_SUBREG + (v2i32 (EXTRACT_SUBREG QPR:$src3, (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; @@ -5442,74 +5442,86 @@ def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>; } -// VBSL : Vector Bitwise Select -def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), - (ins DPR:$src1, DPR:$Vn, DPR:$Vm), - N3RegFrm, IIC_VCNTiD, - "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", - [(set DPR:$Vd, - (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; +// The TwoAddress pass will not go looking for equivalent operations +// with different register constraints; it just inserts copies. +// That is why the pseudo VBSP was introduced. It is expanded later into +// VBIT/VBIF/VBSL, taking register constraints into account to avoid copies.
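For reference, the following is a minimal one-lane model of the three instructions the pseudo can expand to (an illustrative C++ sketch, not part of the patch; the model_* helper names are made up and standard Armv7 NEON semantics are assumed):

#include <cstdint>

// vbsl Vd, Vn, Vm -- Vd initially holds the select mask and is overwritten.
static uint64_t model_vbsl(uint64_t vd_mask, uint64_t vn, uint64_t vm) {
  return (vd_mask & vn) | (~vd_mask & vm);
}

// vbit Vd, Vn, Vm -- Vm holds the mask; Vn bits are inserted where the mask is 1.
static uint64_t model_vbit(uint64_t vd_old, uint64_t vn, uint64_t vm_mask) {
  return (vm_mask & vn) | (~vm_mask & vd_old);
}

// vbif Vd, Vn, Vm -- Vm holds the mask; Vn bits are inserted where the mask is 0.
static uint64_t model_vbif(uint64_t vd_old, uint64_t vn, uint64_t vm_mask) {
  return (~vm_mask & vn) | (vm_mask & vd_old);
}

All three compute the same bitwise select and differ only in which source is tied to the destination register, so the expansion in ARMExpandPseudoInsts.cpp above can pick whichever variant matches the operand the register allocator assigned to the destination; only when the destination matches none of the sources does it fall back to a VORR copy of the mask followed by VBSL.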
+def VBSPd + : PseudoNeonI<(outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), + IIC_VBINiD, "", + [(set DPR:$Vd, + (v2i32 (NEONvbsp DPR:$src1, DPR:$Vn, DPR:$Vm)))]>; let Predicates = [HasNEON] in { def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1), (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))), - (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1), (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))), - (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1), (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))), - (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))), - (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1), (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))), - (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$src1, DPR:$Vn, DPR:$Vm)>; def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd), (and DPR:$Vm, (vnotd DPR:$Vd)))), - (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd), (and DPR:$Vm, (vnotd DPR:$Vd)))), - (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; + (VBSPd DPR:$Vd, DPR:$Vn, DPR:$Vm)>; } -def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), - (ins QPR:$src1, QPR:$Vn, QPR:$Vm), - N3RegFrm, IIC_VCNTiQ, - "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", - [(set QPR:$Vd, - (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; - +def VBSPq + : PseudoNeonI<(outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), + IIC_VBINiQ, "", + [(set QPR:$Vd, + (v4i32 (NEONvbsp QPR:$src1, QPR:$Vn, QPR:$Vm)))]>; let Predicates = [HasNEON] in { def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1), (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))), - (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1), (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))), - (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1), (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))), - (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))), - (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1), (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))), - (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$src1, QPR:$Vn, QPR:$Vm)>; def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd), (and QPR:$Vm, (vnotq QPR:$Vd)))), - (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd), (and QPR:$Vm, (vnotq QPR:$Vd)))), - (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; + (VBSPq QPR:$Vd, QPR:$Vn, QPR:$Vm)>; } +// VBSL : Vector Bitwise Select +def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd), + (ins DPR:$src1, DPR:$Vn, DPR:$Vm), + N3RegFrm, IIC_VBINiD, + "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", + []>; + +def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd), + (ins QPR:$src1, QPR:$Vn, QPR:$Vm), + N3RegFrm, IIC_VBINiQ, + "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd", + []>; + // VBIF : Vector Bitwise Insert if False // like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst", -// FIXME: This instruction's encoding MAY NOT BE correct. 
def VBIFd : N3VX<1, 0, 0b11, 0b0001, 0, 1, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, @@ -5523,7 +5535,6 @@ // VBIT : Vector Bitwise Insert if True // like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst", -// FIXME: This instruction's encoding MAY NOT BE correct. def VBITd : N3VX<1, 0, 0b10, 0b0001, 0, 1, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD, @@ -5535,10 +5546,6 @@ "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd", []>; -// VBIT/VBIF are not yet implemented. The TwoAddress pass will not go looking -// for equivalent operations with different register constraints; it just -// inserts copies. - // Vector Absolute Differences. // VABD : Vector Absolute Difference @@ -7953,7 +7960,7 @@ (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; } -// The following patterns are basically a copy of the patterns above, +// The following patterns are basically a copy of the patterns above, // however with an additional VREV16d instruction to convert data // loaded by VLD1LN into proper vector format in big endian mode. let Predicates = [HasNEON,IsBE] in { Index: llvm/lib/Target/ARM/ARMScheduleA57.td =================================================================== --- llvm/lib/Target/ARM/ARMScheduleA57.td +++ llvm/lib/Target/ARM/ARMScheduleA57.td @@ -1201,7 +1201,7 @@ // --- 3.16 ASIMD Miscellaneous Instructions --- // ASIMD bitwise insert -def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>; +def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL", "VBSP")>; // ASIMD count def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>; Index: llvm/lib/Target/ARM/ARMScheduleR52.td =================================================================== --- llvm/lib/Target/ARM/ARMScheduleR52.td +++ llvm/lib/Target/ARM/ARMScheduleR52.td @@ -787,8 +787,8 @@ def : InstRW<[R52WriteFPALU_F3, R52Read_F2], (instregex "VBICi(v4i16|v2i32)")>; def : InstRW<[R52Write2FPALU_F3, R52Read_F2], (instregex "VBICi(v8i16|v4i32)")>; -def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)d")>; -def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL)q")>; +def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL|VBSP)d")>; +def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F2, R52Read_F2], (instregex "(VBIF|VBIT|VBSL|VBSP)q")>; def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VCEQ|VCGE|VCGT|VCLE|VCLT|VCLZ|VCMP|VCMPE|VCNT)")>; Index: llvm/lib/Target/ARM/ARMScheduleSwift.td =================================================================== --- llvm/lib/Target/ARM/ARMScheduleSwift.td +++ llvm/lib/Target/ARM/ARMScheduleSwift.td @@ -558,8 +558,8 @@ (instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL", "VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi", "VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST", - "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL(s|u)", "VBIF", - "VBIT", "VBSL", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>; + "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL(s|u)", "VBIF", "VBIT", + "VBSL", "VBSP", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>; def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VEXT", "VREV16", "VREV32", "VREV64")>; Index: llvm/test/CodeGen/ARM/fcopysign.ll =================================================================== --- llvm/test/CodeGen/ARM/fcopysign.ll +++ llvm/test/CodeGen/ARM/fcopysign.ll @@ -12,11 +12,11 @@ ; ; HARD-LABEL: 
test1: ; HARD: @ %bb.0: @ %entry -; HARD-NEXT: vmov.f32 s4, s1 +; HARD-NEXT: vmov.f32 s2, s1 ; HARD-NEXT: @ kill: def $s0 killed $s0 def $d0 -; HARD-NEXT: vmov.i32 d1, #0x80000000 -; HARD-NEXT: vbsl d1, d2, d0 -; HARD-NEXT: vmov.f32 s0, s2 +; HARD-NEXT: vmov.i32 d16, #0x80000000 +; HARD-NEXT: vbit d0, d1, d16 +; HARD-NEXT: @ kill: def $s0 killed $s0 killed $d0 ; HARD-NEXT: bx lr entry: @@ -35,8 +35,7 @@ ; HARD: @ %bb.0: @ %entry ; HARD-NEXT: vmov.i32 d16, #0x80000000 ; HARD-NEXT: vshl.i64 d16, d16, #32 -; HARD-NEXT: vbsl d16, d1, d0 -; HARD-NEXT: vorr d0, d16, d16 +; HARD-NEXT: vbit d0, d1, d16 ; HARD-NEXT: bx lr entry: @@ -53,15 +52,16 @@ ; SOFT-NEXT: vmov.i32 d17, #0x80000000 ; SOFT-NEXT: vshl.i64 d17, d17, #32 ; SOFT-NEXT: vldr d18, [sp] -; SOFT-NEXT: vbsl d17, d18, d16 -; SOFT-NEXT: vmov r0, r1, d17 +; SOFT-NEXT: vbit d16, d18, d17 +; SOFT-NEXT: vmov r0, r1, d16 ; SOFT-NEXT: bx lr ; ; HARD-LABEL: test3: ; HARD: @ %bb.0: @ %entry ; HARD-NEXT: vmul.f64 d16, d0, d1 ; HARD-NEXT: vmov.i32 d17, #0x80000000 -; HARD-NEXT: vshl.i64 d0, d17, #32 +; HARD-NEXT: vshl.i64 d17, d17, #32 +; HARD-NEXT: vorr d0, d17, d17 ; HARD-NEXT: vbsl d0, d2, d16 ; HARD-NEXT: bx lr entry: @@ -81,8 +81,8 @@ ; SOFT-NEXT: vmov.i32 d17, #0x80000000 ; SOFT-NEXT: vshr.u64 d16, d16, #32 ; SOFT-NEXT: vmov.f32 d18, #5.000000e-01 -; SOFT-NEXT: vbsl d17, d16, d18 -; SOFT-NEXT: vadd.f32 d0, d0, d17 +; SOFT-NEXT: vbif d16, d18, d17 +; SOFT-NEXT: vadd.f32 d0, d0, d16 ; SOFT-NEXT: vmov r0, s0 ; SOFT-NEXT: pop {lr} ; @@ -93,10 +93,10 @@ ; HARD-NEXT: bl bar ; HARD-NEXT: vmov d16, r0, r1 ; HARD-NEXT: vcvt.f32.f64 s0, d16 -; HARD-NEXT: vmov.i32 d1, #0x80000000 +; HARD-NEXT: vmov.i32 d17, #0x80000000 ; HARD-NEXT: vshr.u64 d16, d16, #32 -; HARD-NEXT: vmov.f32 s4, #5.000000e-01 -; HARD-NEXT: vbsl d1, d16, d2 +; HARD-NEXT: vmov.f32 s2, #5.000000e-01 +; HARD-NEXT: vbit d1, d16, d17 ; HARD-NEXT: vadd.f32 s0, s0, s2 ; HARD-NEXT: pop {r11, pc} entry: Index: llvm/test/CodeGen/ARM/fp16-promote.ll =================================================================== --- llvm/test/CodeGen/ARM/fp16-promote.ll +++ llvm/test/CodeGen/ARM/fp16-promote.ll @@ -701,13 +701,13 @@ ; CHECK-FP16-LABEL: test_copysign: ; CHECK-FP16: ldrh r2, [r0] -; CHECK-FP16-NEXT: vmov.i32 d0, #0x80000000 +; CHECK-FP16-NEXT: vmov.i32 d16, #0x80000000 ; CHECK-FP16-NEXT: ldrh r1, [r1] -; CHECK-FP16-NEXT: vmov s2, r2 -; CHECK-FP16-NEXT: vmov s4, r1 +; CHECK-FP16-NEXT: vmov s0, r2 +; CHECK-FP16-NEXT: vmov s2, r1 +; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-FP16-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-FP16-NEXT: vcvtb.f32.f16 s4, s4 -; CHECK-FP16-NEXT: vbsl d0, d2, d1 +; CHECK-FP16-NEXT: vbit d0, d1, d16 ; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-FP16-NEXT: vmov r1, s0 ; CHECK-FP16-NEXT: strh r1, [r0] @@ -729,8 +729,8 @@ ; CHECK-LIBCALL-VFP-NEXT: mov r0, r1 ; CHECK-LIBCALL: bl __aeabi_h2f ; CHECK-LIBCALL-VFP: vmov s0, r0 -; CHECK-LIBCALL-VFP-NEXT: vbsl d8, d0, d9 -; CHECK-LIBCALL-VFP-NEXT: vmov r0, s16 +; CHECK-LIBCALL-VFP-NEXT: vbif d0, d9, d8 +; CHECK-LIBCALL-VFP-NEXT: vmov r0, s0 ; CHECK-LIBCALL: bl __aeabi_f2h ; CHECK-LIBCALL-VFP: strh r0, [r5] ; CHECK-LIBCALL-VFP-NEXT: vpop {d8, d9} Index: llvm/test/CodeGen/ARM/vbsl-constant.ll =================================================================== --- llvm/test/CodeGen/ARM/vbsl-constant.ll +++ llvm/test/CodeGen/ARM/vbsl-constant.ll @@ -79,9 +79,9 @@ ; CHECK-NEXT: vld1.32 {d16, d17}, [r2] ; CHECK-NEXT: vmov.i8 q9, #0x3 ; CHECK-NEXT: vld1.32 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q9, q10, q8 -; CHECK-NEXT: vmov r0, r1, d18 -; 
CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vbit q8, q10, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B @@ -98,9 +98,9 @@ ; CHECK-NEXT: vld1.32 {d16, d17}, [r2] ; CHECK-NEXT: vmov.i16 q9, #0x3 ; CHECK-NEXT: vld1.32 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q9, q10, q8 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vbit q8, q10, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -117,9 +117,9 @@ ; CHECK-NEXT: vld1.32 {d16, d17}, [r2] ; CHECK-NEXT: vmov.i32 q9, #0x3 ; CHECK-NEXT: vld1.32 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q9, q10, q8 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vbit q8, q10, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B @@ -137,9 +137,9 @@ ; CHECK-NEXT: vld1.32 {d18, d19}, [r0] ; CHECK-NEXT: adr r0, LCPI7_0 ; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128] -; CHECK-NEXT: vbsl q10, q9, q8 -; CHECK-NEXT: vmov r0, r1, d20 -; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: vbit q8, q9, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B Index: llvm/test/CodeGen/ARM/vbsl.ll =================================================================== --- llvm/test/CodeGen/ARM/vbsl.ll +++ llvm/test/CodeGen/ARM/vbsl.ll @@ -6,11 +6,11 @@ define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { ; CHECK-LABEL: v_bsli8: ; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d18, [r0] ; CHECK-NEXT: vldr d16, [r2] ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vldr d18, [r0] -; CHECK-NEXT: vbsl d18, d17, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vbit d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B @@ -25,11 +25,11 @@ define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { ; CHECK-LABEL: v_bsli16: ; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d18, [r0] ; CHECK-NEXT: vldr d16, [r2] ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vldr d18, [r0] -; CHECK-NEXT: vbsl d18, d17, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vbit d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -44,11 +44,11 @@ define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { ; CHECK-LABEL: v_bsli32: ; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d18, [r0] ; CHECK-NEXT: vldr d16, [r2] ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vldr d18, [r0] -; CHECK-NEXT: vbsl d18, d17, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vbit d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -63,11 +63,11 @@ define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind { ; CHECK-LABEL: v_bsli64: ; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d18, [r0] ; CHECK-NEXT: vldr d16, [r2] ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vldr d18, [r0] -; CHECK-NEXT: vbsl d18, d17, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vbit d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; 
CHECK-NEXT: mov pc, lr %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B @@ -82,12 +82,12 @@ define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { ; CHECK-LABEL: v_bslQi8: ; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] ; CHECK-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q10, q9, q8 -; CHECK-NEXT: vmov r0, r1, d20 -; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: vbit q8, q9, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B @@ -102,12 +102,12 @@ define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { ; CHECK-LABEL: v_bslQi16: ; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] ; CHECK-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q10, q9, q8 -; CHECK-NEXT: vmov r0, r1, d20 -; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: vbit q8, q9, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -122,12 +122,12 @@ define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: v_bslQi32: ; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] ; CHECK-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q10, q9, q8 -; CHECK-NEXT: vmov r0, r1, d20 -; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: vbit q8, q9, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B @@ -142,12 +142,12 @@ define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: v_bslQi64: ; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] ; CHECK-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q10, q9, q8 -; CHECK-NEXT: vmov r0, r1, d20 -; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: vbit q8, q9, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B @@ -165,8 +165,8 @@ ; CHECK-NEXT: vldr d16, [sp] ; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbsl d18, d17, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vbit d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind ret <8 x i8> %vbsl.i @@ -178,8 +178,8 @@ ; CHECK-NEXT: vldr d16, [sp] ; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbsl d18, d17, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vbit d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind ret <4 x i16> %vbsl3.i @@ -191,8 +191,8 @@ ; CHECK-NEXT: vldr d16, [sp] ; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbsl d18, d17, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vbit d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, 
lr %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind ret <2 x i32> %vbsl3.i @@ -204,8 +204,8 @@ ; CHECK-NEXT: vldr d16, [sp] ; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbsl d18, d17, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vbit d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %vbsl4.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind ret <2 x float> %vbsl4.i @@ -214,15 +214,15 @@ define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: g1: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d18, r0, r1 ; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q9, q10, q8 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vbit q8, q10, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind ret <16 x i8> %vbsl.i @@ -231,15 +231,15 @@ define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: g2: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d18, r0, r1 ; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q9, q10, q8 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vbit q8, q10, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind ret <8 x i16> %vbsl3.i @@ -248,15 +248,15 @@ define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: g3: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d18, r0, r1 ; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q9, q10, q8 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vbit q8, q10, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind ret <4 x i32> %vbsl3.i @@ -265,15 +265,15 @@ define <4 x float> @g4(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: g4: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d18, r0, r1 ; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q9, q10, q8 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vbit q8, q10, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %vbsl4.i = tail call <4 x float> 
@llvm.arm.neon.vbsl.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind ret <4 x float> %vbsl4.i @@ -285,8 +285,8 @@ ; CHECK-NEXT: vldr d16, [sp] ; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbsl d18, d17, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vbit d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind ret <1 x i64> %vbsl3.i @@ -298,8 +298,8 @@ ; CHECK-NEXT: vldr d16, [sp] ; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbsl d18, d17, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vbit d16, d17, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind ret <1 x i64> %vbsl3.i @@ -308,15 +308,15 @@ define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: test_vbslq_s64: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d18, r0, r1 ; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q9, q10, q8 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vbit q8, q10, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind ret <2 x i64> %vbsl3.i @@ -325,15 +325,15 @@ define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: test_vbslq_u64: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: add r12, sp, #16 ; CHECK-NEXT: vmov d18, r0, r1 ; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbsl q9, q10, q8 -; CHECK-NEXT: vmov r0, r1, d18 -; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vbit q8, q10, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind ret <2 x i64> %vbsl3.i Index: llvm/test/CodeGen/ARM/vselect_imax.ll =================================================================== --- llvm/test/CodeGen/ARM/vselect_imax.ll +++ llvm/test/CodeGen/ARM/vselect_imax.ll @@ -70,20 +70,20 @@ ; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128]! -; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128]! -; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128] +; CHECK-NEXT: vld1.64 {d16, d17}, [r1:128]! 
+; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128] ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128] -; CHECK-NEXT: vmov.32 r12, d16[0] +; CHECK-NEXT: vmov.32 r12, d18[0] ; CHECK-NEXT: vmov.32 r2, d20[0] -; CHECK-NEXT: vmov.32 lr, d16[1] +; CHECK-NEXT: vmov.32 lr, d18[1] ; CHECK-NEXT: vmov.32 r0, d20[1] -; CHECK-NEXT: vmov.32 r7, d18[0] +; CHECK-NEXT: vmov.32 r7, d16[0] ; CHECK-NEXT: vmov.32 r5, d22[0] ; CHECK-NEXT: vmov.32 r4, d22[1] -; CHECK-NEXT: vmov.32 r6, d17[0] +; CHECK-NEXT: vmov.32 r6, d19[0] ; CHECK-NEXT: subs r2, r2, r12 -; CHECK-NEXT: vmov.32 r2, d18[1] +; CHECK-NEXT: vmov.32 r2, d16[1] ; CHECK-NEXT: sbcs r0, r0, lr ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 @@ -91,7 +91,7 @@ ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: subs r7, r5, r7 ; CHECK-NEXT: vmov.32 r7, d21[0] -; CHECK-NEXT: vmov.32 r5, d17[1] +; CHECK-NEXT: vmov.32 r5, d19[1] ; CHECK-NEXT: sbcs r2, r4, r2 ; CHECK-NEXT: vmov.32 r4, d21[1] ; CHECK-NEXT: mov r2, #0 @@ -100,11 +100,11 @@ ; CHECK-NEXT: mvnne r2, #0 ; CHECK-NEXT: subs r7, r7, r6 ; CHECK-NEXT: vmov.32 r6, d23[0] -; CHECK-NEXT: vmov.32 r7, d19[0] +; CHECK-NEXT: vmov.32 r7, d17[0] ; CHECK-NEXT: sbcs r5, r4, r5 ; CHECK-NEXT: mov r4, #0 ; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: vmov.32 r5, d19[1] +; CHECK-NEXT: vmov.32 r5, d17[1] ; CHECK-NEXT: subs r7, r6, r7 ; CHECK-NEXT: vmov.32 r7, d23[1] ; CHECK-NEXT: sbcs r7, r7, r5 @@ -116,11 +116,11 @@ ; CHECK-NEXT: mvnne r4, #0 ; CHECK-NEXT: vdup.32 d24, r2 ; CHECK-NEXT: vdup.32 d27, r4 -; CHECK-NEXT: vbsl q12, q11, q9 +; CHECK-NEXT: vbit q8, q11, q12 ; CHECK-NEXT: vdup.32 d26, r0 -; CHECK-NEXT: vbsl q13, q10, q8 -; CHECK-NEXT: vst1.64 {d24, d25}, [r3:128]! -; CHECK-NEXT: vst1.64 {d26, d27}, [r3:128] +; CHECK-NEXT: vbit q9, q10, q13 +; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128]! +; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128] ; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov pc, lr %v0 = load %T0_18, %T0_18* %loadaddr @@ -138,121 +138,119 @@ %T1_19* %blend, %T0_19* %storeaddr) { ; CHECK-LABEL: func_blend19: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vld1.64 {d24, d25}, [r12:128]! -; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: mov lr, #0 -; CHECK-NEXT: vld1.64 {d28, d29}, [r2:128]! -; CHECK-NEXT: vld1.64 {d16, d17}, [r12:128] +; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: add r2, r1, #48 +; CHECK-NEXT: add r5, r1, #32 +; CHECK-NEXT: vld1.64 {d16, d17}, [r2:128] +; CHECK-NEXT: add r2, r0, #48 +; CHECK-NEXT: add r6, r0, #32 +; CHECK-NEXT: mov r7, #0 ; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128] -; CHECK-NEXT: add r2, r1, #32 -; CHECK-NEXT: add r1, r1, #48 -; CHECK-NEXT: vld1.64 {d20, d21}, [r2:128] -; CHECK-NEXT: add r2, r0, #32 -; CHECK-NEXT: add r0, r0, #48 -; CHECK-NEXT: vld1.64 {d30, d31}, [r2:128] -; CHECK-NEXT: vmov.32 r4, d16[0] +; CHECK-NEXT: vmov.32 r12, d16[0] ; CHECK-NEXT: vmov.32 r2, d18[0] -; CHECK-NEXT: vmov.32 r12, d16[1] -; CHECK-NEXT: vmov.32 r5, d18[1] -; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128] -; CHECK-NEXT: vmov.32 r1, d21[0] -; CHECK-NEXT: vld1.64 {d26, d27}, [r0:128] -; CHECK-NEXT: vmov.32 r0, d21[1] -; CHECK-NEXT: subs r2, r2, r4 -; CHECK-NEXT: vmov.32 r4, d31[1] -; CHECK-NEXT: vmov.32 r2, d31[0] -; CHECK-NEXT: sbcs r5, r5, r12 +; CHECK-NEXT: vmov.32 lr, d16[1] +; CHECK-NEXT: vmov.32 r4, d18[1] +; CHECK-NEXT: vld1.64 {d28, d29}, [r0:128]! 
+; CHECK-NEXT: vld1.64 {d26, d27}, [r5:128] +; CHECK-NEXT: vld1.64 {d30, d31}, [r6:128] +; CHECK-NEXT: vmov.32 r5, d17[0] +; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128] +; CHECK-NEXT: vmov.32 r0, d17[1] +; CHECK-NEXT: vld1.64 {d24, d25}, [r1:128]! +; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128] +; CHECK-NEXT: mov r1, #0 +; CHECK-NEXT: subs r2, r2, r12 ; CHECK-NEXT: mov r12, #0 +; CHECK-NEXT: vmov.32 r2, d19[0] +; CHECK-NEXT: sbcs r6, r4, lr +; CHECK-NEXT: vmov.32 r4, d24[0] +; CHECK-NEXT: vmov.32 r6, d19[1] ; CHECK-NEXT: movlt r12, #1 ; CHECK-NEXT: cmp r12, #0 ; CHECK-NEXT: mvnne r12, #0 -; CHECK-NEXT: vmov.32 r5, d25[0] -; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: subs r2, r2, r5 +; CHECK-NEXT: vmov.32 r5, d28[0] ; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: sbcs r0, r4, r0 -; CHECK-NEXT: vmov.32 r1, d29[0] -; CHECK-NEXT: vmov.32 r0, d25[1] +; CHECK-NEXT: sbcs r0, r6, r0 +; CHECK-NEXT: vmov.32 r6, d28[1] +; CHECK-NEXT: vmov.32 r0, d24[1] ; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: vmov.32 r4, d29[1] ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d5, r2 -; CHECK-NEXT: subs r1, r1, r5 -; CHECK-NEXT: vmov.32 r5, d24[1] -; CHECK-NEXT: vmov.32 r1, d24[0] -; CHECK-NEXT: sbcs r0, r4, r0 -; CHECK-NEXT: vmov.32 r4, d28[0] +; CHECK-NEXT: vdup.32 d7, r2 +; CHECK-NEXT: vdup.32 d6, r12 +; CHECK-NEXT: subs r5, r5, r4 +; CHECK-NEXT: vmov.32 r4, d25[1] +; CHECK-NEXT: vmov.32 r5, d25[0] +; CHECK-NEXT: sbcs r0, r6, r0 +; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: vmov.32 r0, d29[0] +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: mvnne r6, #0 +; CHECK-NEXT: subs r0, r0, r5 +; CHECK-NEXT: vmov.32 r5, d21[0] +; CHECK-NEXT: vmov.32 r0, d29[1] +; CHECK-NEXT: sbcs r0, r0, r4 +; CHECK-NEXT: vmov.32 r4, d23[0] ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: vdup.32 d1, r0 -; CHECK-NEXT: vmov.32 r0, d19[0] -; CHECK-NEXT: subs r1, r4, r1 -; CHECK-NEXT: vmov.32 r4, d17[0] -; CHECK-NEXT: vmov.32 r1, d28[1] -; CHECK-NEXT: sbcs r1, r1, r5 -; CHECK-NEXT: vmov.32 r5, d17[1] -; CHECK-NEXT: mov r1, #0 -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: subs r0, r0, r4 -; CHECK-NEXT: vmov.32 r0, d19[1] -; CHECK-NEXT: vmov.32 r4, d22[0] -; CHECK-NEXT: vdup.32 d0, r1 -; CHECK-NEXT: vmov.32 r1, d22[1] -; CHECK-NEXT: vbsl q0, q14, q12 -; CHECK-NEXT: sbcs r0, r0, r5 -; CHECK-NEXT: vmov.32 r5, d26[0] ; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: vmov.32 r5, d20[0] -; CHECK-NEXT: vmov.32 r4, d26[1] -; CHECK-NEXT: sbcs r1, r4, r1 -; CHECK-NEXT: vmov.32 r4, d30[0] -; CHECK-NEXT: mov r1, #0 -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: subs r4, r4, r5 -; CHECK-NEXT: vmov.32 r5, d30[1] -; CHECK-NEXT: vmov.32 r4, d20[1] -; CHECK-NEXT: sbcs r4, r5, r4 -; CHECK-NEXT: vmov.32 r5, d27[0] -; CHECK-NEXT: vmov.32 r4, d23[0] -; CHECK-NEXT: movlt r6, #1 -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: vmov.32 r5, d27[1] +; CHECK-NEXT: vdup.32 d0, r6 +; CHECK-NEXT: vmov.32 r6, d22[0] +; CHECK-NEXT: vbit q12, q14, q0 +; CHECK-NEXT: subs r5, r4, r5 ; CHECK-NEXT: vmov.32 r4, d23[1] -; CHECK-NEXT: sbcs r4, r5, r4 -; CHECK-NEXT: movlt lr, #1 -; CHECK-NEXT: cmp lr, #0 -; CHECK-NEXT: mvnne lr, #0 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: mvnne r6, #0 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvnne r1, #0 +; CHECK-NEXT: vmov.32 r5, d21[1] +; CHECK-NEXT: sbcs r5, r4, r5 +; CHECK-NEXT: vmov.32 r4, d20[1] +; CHECK-NEXT: vmov.32 r5, d20[0] +; CHECK-NEXT: movlt r0, #1 ; 
CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vdup.32 d4, r6 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d3, lr -; CHECK-NEXT: vbsl q2, q15, q10 -; CHECK-NEXT: vdup.32 d21, r0 +; CHECK-NEXT: vdup.32 d5, r0 ; CHECK-NEXT: add r0, r3, #32 +; CHECK-NEXT: subs r6, r6, r5 +; CHECK-NEXT: vmov.32 r5, d26[0] +; CHECK-NEXT: vmov.32 r6, d22[1] +; CHECK-NEXT: sbcs r6, r6, r4 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: vmov.32 r6, d30[0] +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: subs r6, r6, r5 +; CHECK-NEXT: vmov.32 r5, d30[1] +; CHECK-NEXT: vmov.32 r6, d26[1] +; CHECK-NEXT: sbcs r6, r5, r6 +; CHECK-NEXT: vmov.32 r5, d31[0] +; CHECK-NEXT: vmov.32 r6, d27[0] +; CHECK-NEXT: movlt r1, #1 +; CHECK-NEXT: subs r6, r5, r6 +; CHECK-NEXT: vmov.32 r5, d31[1] +; CHECK-NEXT: vmov.32 r6, d27[1] +; CHECK-NEXT: sbcs r6, r5, r6 +; CHECK-NEXT: movlt r7, #1 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: mvnne r7, #0 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mvnne r1, #0 +; CHECK-NEXT: vdup.32 d3, r7 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: vdup.32 d2, r1 -; CHECK-NEXT: vdup.32 d20, r12 -; CHECK-NEXT: vbsl q1, q13, q11 -; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128] +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: vbit q13, q15, q1 +; CHECK-NEXT: vdup.32 d4, r4 +; CHECK-NEXT: vbit q10, q11, q2 +; CHECK-NEXT: vbit q8, q9, q3 +; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128] ; CHECK-NEXT: add r0, r3, #48 -; CHECK-NEXT: vbsl q10, q9, q8 -; CHECK-NEXT: vst1.64 {d0, d1}, [r3:128]! -; CHECK-NEXT: vst1.64 {d2, d3}, [r0:128] +; CHECK-NEXT: vst1.64 {d24, d25}, [r3:128]! +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] ; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128] -; CHECK-NEXT: pop {r4, r5, r6, lr} +; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov pc, lr %v0 = load %T0_19, %T0_19* %loadaddr %v1 = load %T0_19, %T0_19* %loadaddr2 @@ -277,232 +275,233 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, sp, #8 -; CHECK-NEXT: add r9, r1, #64 -; CHECK-NEXT: mov r2, #32 -; CHECK-NEXT: add r8, r0, #64 -; CHECK-NEXT: vld1.64 {d18, d19}, [r9:128], r2 -; CHECK-NEXT: mov r10, #0 -; CHECK-NEXT: vld1.64 {d22, d23}, [r8:128], r2 -; CHECK-NEXT: vmov.32 r2, d19[0] +; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: vld1.64 {d16, d17}, [r8:128]! +; CHECK-NEXT: add r10, r0, #64 +; CHECK-NEXT: vld1.64 {d18, d19}, [r9:128]! +; CHECK-NEXT: vmov.32 r2, d16[0] ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: vmov.32 r7, d23[0] -; CHECK-NEXT: mov r3, #0 -; CHECK-NEXT: vmov.32 r5, d19[1] -; CHECK-NEXT: vmov.32 r6, d23[1] -; CHECK-NEXT: vld1.64 {d2, d3}, [r9:128]! -; CHECK-NEXT: vmov.32 r12, d2[0] -; CHECK-NEXT: subs r2, r7, r2 -; CHECK-NEXT: mov r7, r1 -; CHECK-NEXT: vld1.64 {d20, d21}, [r7:128]! 
-; CHECK-NEXT: sbcs r2, r6, r5 -; CHECK-NEXT: vmov.32 r5, d18[0] +; CHECK-NEXT: vmov.32 r6, d18[0] +; CHECK-NEXT: vmov.32 r4, d16[1] +; CHECK-NEXT: vmov.32 r7, d18[1] +; CHECK-NEXT: vmov.32 r5, d17[0] +; CHECK-NEXT: subs r2, r6, r2 +; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: vmov.32 r2, d19[0] +; CHECK-NEXT: sbcs r7, r7, r4 +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: vmov.32 r7, d17[1] +; CHECK-NEXT: subs r2, r2, r5 +; CHECK-NEXT: vmov.32 r2, d19[1] +; CHECK-NEXT: sbcs r2, r2, r7 ; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: vmov.32 r6, d22[0] ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vld1.64 {d0, d1}, [r7:128] ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d17, r2 -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: subs r5, r6, r5 -; CHECK-NEXT: vmov.32 r6, d22[1] -; CHECK-NEXT: vmov.32 r5, d18[1] -; CHECK-NEXT: sbcs r5, r6, r5 -; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: movlt r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: mvnne r5, #0 -; CHECK-NEXT: vdup.32 d16, r5 -; CHECK-NEXT: vbsl q8, q11, q9 -; CHECK-NEXT: vld1.64 {d22, d23}, [r2:128]! -; CHECK-NEXT: vmov.32 r5, d21[0] -; CHECK-NEXT: vmov.32 r6, d23[0] -; CHECK-NEXT: vld1.64 {d30, d31}, [r2:128] -; CHECK-NEXT: vmov.32 r2, d1[0] -; CHECK-NEXT: vmov.32 r7, d30[0] -; CHECK-NEXT: subs r5, r6, r5 -; CHECK-NEXT: vmov.32 r6, d23[1] -; CHECK-NEXT: vmov.32 r5, d21[1] -; CHECK-NEXT: sbcs r5, r6, r5 -; CHECK-NEXT: vmov.32 r6, d22[0] -; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: movlt r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: mvnne r5, #0 -; CHECK-NEXT: vdup.32 d19, r5 -; CHECK-NEXT: vmov.32 r5, d20[0] +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: vdup.32 d21, r2 +; CHECK-NEXT: mvnne r6, #0 +; CHECK-NEXT: vdup.32 d20, r6 +; CHECK-NEXT: mov r2, #32 +; CHECK-NEXT: add r6, r1, #64 +; CHECK-NEXT: vld1.64 {d24, d25}, [r10:128], r2 +; CHECK-NEXT: vbit q8, q9, q10 +; CHECK-NEXT: vld1.64 {d28, d29}, [r6:128], r2 +; CHECK-NEXT: vmov.32 r4, d29[0] +; CHECK-NEXT: vmov.32 r5, d25[0] +; CHECK-NEXT: vld1.64 {d0, d1}, [r9:128] +; CHECK-NEXT: vld1.64 {d2, d3}, [r8:128] +; CHECK-NEXT: vld1.64 {d22, d23}, [r6:128]! +; CHECK-NEXT: vld1.64 {d20, d21}, [r6:128] +; CHECK-NEXT: vmov.32 r6, d0[0] +; CHECK-NEXT: vld1.64 {d18, d19}, [r10:128]! 
+; CHECK-NEXT: vmov.32 r9, d23[0] +; CHECK-NEXT: vmov.32 r11, d19[0] +; CHECK-NEXT: vmov.32 r8, d23[1] +; CHECK-NEXT: subs r4, r5, r4 +; CHECK-NEXT: vmov.32 r5, d25[1] +; CHECK-NEXT: vmov.32 r4, d29[1] +; CHECK-NEXT: sbcs r4, r5, r4 +; CHECK-NEXT: vmov.32 r5, d24[0] +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: vdup.32 d5, r4 +; CHECK-NEXT: vmov.32 r4, d28[0] +; CHECK-NEXT: subs r4, r5, r4 +; CHECK-NEXT: vmov.32 r5, d24[1] +; CHECK-NEXT: vmov.32 r4, d28[1] +; CHECK-NEXT: sbcs r4, r5, r4 +; CHECK-NEXT: vmov.32 r5, d1[0] +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: vdup.32 d4, r4 +; CHECK-NEXT: vmov.32 r4, d3[0] +; CHECK-NEXT: subs r4, r5, r4 +; CHECK-NEXT: vmov.32 r5, d1[1] +; CHECK-NEXT: vmov.32 r4, d3[1] +; CHECK-NEXT: sbcs r4, r5, r4 +; CHECK-NEXT: add r5, r1, #32 +; CHECK-NEXT: vld1.64 {d26, d27}, [r5:128] +; CHECK-NEXT: add r5, r1, #48 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: add r1, r1, #80 +; CHECK-NEXT: vld1.64 {d30, d31}, [r5:128] +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: vbif q12, q14, q2 +; CHECK-NEXT: vmov.32 r5, d2[0] +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: vdup.32 d29, r4 +; CHECK-NEXT: vmov.32 r4, d31[1] ; CHECK-NEXT: subs r5, r6, r5 -; CHECK-NEXT: vmov.32 r6, d22[1] -; CHECK-NEXT: vmov.32 r5, d20[1] +; CHECK-NEXT: vmov.32 r6, d0[1] +; CHECK-NEXT: vmov.32 r5, d2[1] ; CHECK-NEXT: sbcs r5, r6, r5 +; CHECK-NEXT: add r6, r0, #48 ; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: vld1.64 {d6, d7}, [r6:128] ; CHECK-NEXT: movlt r5, #1 ; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: mvnne r5, #0 -; CHECK-NEXT: vdup.32 d18, r5 -; CHECK-NEXT: add r5, r0, #32 -; CHECK-NEXT: vbsl q9, q11, q10 -; CHECK-NEXT: vld1.64 {d22, d23}, [r5:128] -; CHECK-NEXT: add r5, r1, #32 -; CHECK-NEXT: vld1.64 {d24, d25}, [r5:128] -; CHECK-NEXT: vmov.32 r5, d24[0] -; CHECK-NEXT: vmov.32 r6, d22[0] -; CHECK-NEXT: vmov.32 r4, d23[0] -; CHECK-NEXT: vld1.64 {d20, d21}, [r8:128]! 
-; CHECK-NEXT: vmov.32 r11, d21[0] -; CHECK-NEXT: subs r5, r6, r5 -; CHECK-NEXT: vmov.32 r6, d22[1] -; CHECK-NEXT: vmov.32 r5, d24[1] -; CHECK-NEXT: sbcs r5, r6, r5 -; CHECK-NEXT: vmov.32 r6, d25[0] -; CHECK-NEXT: movlt r10, #1 -; CHECK-NEXT: cmp r10, #0 -; CHECK-NEXT: mvnne r10, #0 -; CHECK-NEXT: subs r4, r4, r6 -; CHECK-NEXT: vmov.32 r6, d23[1] -; CHECK-NEXT: vmov.32 r4, d25[1] +; CHECK-NEXT: vmov.32 r7, d7[0] +; CHECK-NEXT: vdup.32 d28, r5 +; CHECK-NEXT: vmov.32 r5, d31[0] +; CHECK-NEXT: vbsl q14, q0, q1 +; CHECK-NEXT: vmov.32 r6, d7[1] +; CHECK-NEXT: vmov.32 r2, d6[0] +; CHECK-NEXT: subs r5, r7, r5 +; CHECK-NEXT: vmov.32 r7, d6[1] ; CHECK-NEXT: sbcs r4, r6, r4 -; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: vmov.32 r4, d31[0] -; CHECK-NEXT: movlt r6, #1 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: mvnne r6, #0 -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: vmov.32 r4, d31[1] -; CHECK-NEXT: vmov.32 r2, d1[1] -; CHECK-NEXT: sbcs r2, r4, r2 +; CHECK-NEXT: vmov.32 r6, d30[0] +; CHECK-NEXT: vmov.32 r5, d30[1] +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: vdup.32 d3, r4 +; CHECK-NEXT: vmov.32 r4, d26[1] +; CHECK-NEXT: subs r2, r2, r6 +; CHECK-NEXT: sbcs r2, r7, r5 +; CHECK-NEXT: add r5, r0, #32 ; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: vld1.64 {d0, d1}, [r5:128] ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d27, r2 -; CHECK-NEXT: add r2, r0, #48 -; CHECK-NEXT: vld1.64 {d4, d5}, [r2:128] -; CHECK-NEXT: add r2, r1, #48 +; CHECK-NEXT: vmov.32 r6, d0[0] +; CHECK-NEXT: vdup.32 d2, r2 ; CHECK-NEXT: add r0, r0, #80 -; CHECK-NEXT: add r1, r1, #80 -; CHECK-NEXT: vld1.64 {d6, d7}, [r2:128] -; CHECK-NEXT: vmov.32 r2, d7[0] -; CHECK-NEXT: vmov.32 r4, d5[0] -; CHECK-NEXT: vmov.32 r5, d4[0] +; CHECK-NEXT: vmov.32 r2, d26[0] +; CHECK-NEXT: vbit q15, q3, q1 +; CHECK-NEXT: vmov.32 r5, d0[1] +; CHECK-NEXT: vmov.32 r7, d1[0] +; CHECK-NEXT: vld1.64 {d2, d3}, [r10:128] +; CHECK-NEXT: vld1.64 {d6, d7}, [r1:128] ; CHECK-NEXT: vld1.64 {d8, d9}, [r0:128] -; CHECK-NEXT: subs r2, r4, r2 -; CHECK-NEXT: vmov.32 r4, d5[1] -; CHECK-NEXT: vmov.32 r2, d7[1] -; CHECK-NEXT: sbcs r2, r4, r2 -; CHECK-NEXT: vmov.32 r4, d0[0] +; CHECK-NEXT: vmov.32 r1, d7[1] +; CHECK-NEXT: vmov.32 r10, d19[1] +; CHECK-NEXT: vmov.32 lr, d6[0] +; CHECK-NEXT: vmov.32 r3, d8[0] +; CHECK-NEXT: vmov.32 r12, d8[1] +; CHECK-NEXT: subs r2, r6, r2 +; CHECK-NEXT: vmov.32 r6, d1[1] +; CHECK-NEXT: sbcs r2, r5, r4 +; CHECK-NEXT: vmov.32 r5, d27[0] +; CHECK-NEXT: vmov.32 r4, d27[1] ; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d29, r2 -; CHECK-NEXT: vmov.32 r2, d6[1] -; CHECK-NEXT: subs r4, r7, r4 -; CHECK-NEXT: vmov.32 r7, d30[1] -; CHECK-NEXT: vmov.32 r4, d0[1] -; CHECK-NEXT: sbcs r4, r7, r4 -; CHECK-NEXT: vmov.32 r7, d4[1] +; CHECK-NEXT: subs r5, r7, r5 +; CHECK-NEXT: vmov.32 r7, d7[0] +; CHECK-NEXT: sbcs r4, r6, r4 +; CHECK-NEXT: vmov.32 r6, d2[0] ; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: vmov.32 r5, d2[1] ; CHECK-NEXT: movlt r4, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: vdup.32 d26, r4 -; CHECK-NEXT: vmov.32 r4, d6[0] -; CHECK-NEXT: vbsl q13, q15, q0 -; CHECK-NEXT: vld1.64 {d0, d1}, [r9:128] -; CHECK-NEXT: vdup.32 d31, r6 -; CHECK-NEXT: vmov.32 r9, d3[0] -; CHECK-NEXT: vdup.32 d30, r10 -; CHECK-NEXT: vmov.32 r10, d21[1] -; CHECK-NEXT: vbsl q15, q11, q12 -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: sbcs r2, r7, r2 -; CHECK-NEXT: 
vmov.32 r4, d0[1] -; CHECK-NEXT: mov r2, #0 -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d28, r2 -; CHECK-NEXT: vbsl q14, q2, q3 -; CHECK-NEXT: vld1.64 {d4, d5}, [r8:128] -; CHECK-NEXT: vmov.32 r2, d0[0] -; CHECK-NEXT: vmov.32 r6, d4[0] -; CHECK-NEXT: vmov.32 r5, d4[1] -; CHECK-NEXT: vld1.64 {d6, d7}, [r1:128] -; CHECK-NEXT: vmov.32 r7, d7[0] -; CHECK-NEXT: vmov.32 r1, d7[1] -; CHECK-NEXT: vmov.32 lr, d5[0] -; CHECK-NEXT: vmov.32 r8, d3[1] +; CHECK-NEXT: vdup.32 d5, r4 +; CHECK-NEXT: vdup.32 d4, r2 +; CHECK-NEXT: vmov.32 r2, d20[0] +; CHECK-NEXT: vbit q13, q0, q2 +; CHECK-NEXT: vmov.32 r4, d20[1] ; CHECK-NEXT: subs r0, r6, r2 ; CHECK-NEXT: vmov.32 r2, d9[1] ; CHECK-NEXT: sbcs r0, r5, r4 ; CHECK-NEXT: vmov.32 r4, d9[0] -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: mvnne r3, #0 -; CHECK-NEXT: vmov.32 r6, d8[1] -; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: vmov.32 r0, d5[1] +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vmov.32 r6, d18[0] +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mvnne r0, #0 +; CHECK-NEXT: vmov.32 r5, d18[1] ; CHECK-NEXT: subs r4, r4, r7 -; CHECK-NEXT: vmov.32 r7, d2[1] +; CHECK-NEXT: vmov.32 r7, d21[1] ; CHECK-NEXT: sbcs r1, r2, r1 -; CHECK-NEXT: vmov.32 r2, d8[0] -; CHECK-NEXT: vmov.32 r1, d6[0] -; CHECK-NEXT: movlt r5, #1 -; CHECK-NEXT: vmov.32 r4, d6[1] -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: mvnne r5, #0 -; CHECK-NEXT: vdup.32 d11, r5 -; CHECK-NEXT: vmov.32 r5, d20[0] -; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: vmov.32 r2, d1[0] -; CHECK-NEXT: sbcs r1, r6, r4 -; CHECK-NEXT: vmov.32 r6, d1[1] -; CHECK-NEXT: vmov.32 r4, d20[1] +; CHECK-NEXT: vmov.32 r4, d22[1] +; CHECK-NEXT: vmov.32 r1, d22[0] +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: mvnne r2, #0 +; CHECK-NEXT: vdup.32 d11, r2 +; CHECK-NEXT: vmov.32 r2, d3[1] +; CHECK-NEXT: subs r1, r6, r1 +; CHECK-NEXT: vmov.32 r6, d21[0] +; CHECK-NEXT: sbcs r1, r5, r4 +; CHECK-NEXT: vmov.32 r4, d3[0] +; CHECK-NEXT: vmov.32 r5, d6[1] ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: vdup.32 d10, r1 -; CHECK-NEXT: mov r1, #0 -; CHECK-NEXT: vbsl q5, q4, q3 -; CHECK-NEXT: subs r2, lr, r2 -; CHECK-NEXT: sbcs r0, r0, r6 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: subs r2, r5, r12 -; CHECK-NEXT: sbcs r2, r4, r7 +; CHECK-NEXT: subs r4, r4, r6 +; CHECK-NEXT: sbcs r2, r2, r7 ; CHECK-NEXT: mov r2, #0 ; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: subs r7, r11, r9 -; CHECK-NEXT: sbcs r7, r10, r8 -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvnne r1, #0 +; CHECK-NEXT: subs r4, r11, r9 +; CHECK-NEXT: sbcs r4, r10, r8 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: subs r3, r3, lr +; CHECK-NEXT: sbcs r3, r12, r5 +; CHECK-NEXT: mov r3, #0 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: mvnne r3, #0 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: vdup.32 d10, r3 +; CHECK-NEXT: vdup.32 d1, r4 +; CHECK-NEXT: vorr q2, q5, q5 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: vdup.32 d23, r1 +; CHECK-NEXT: vdup.32 d0, r1 +; CHECK-NEXT: vbsl q2, q4, q3 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: vbif q9, q11, q0 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d22, r2 -; CHECK-NEXT: vdup.32 d25, r0 +; CHECK-NEXT: vdup.32 d7, r2 +; CHECK-NEXT: vdup.32 d6, r0 ; CHECK-NEXT: 
add r0, r1, #80 -; CHECK-NEXT: vbsl q11, q10, q1 -; CHECK-NEXT: vdup.32 d24, r3 -; CHECK-NEXT: vst1.64 {d10, d11}, [r0:128] +; CHECK-NEXT: vbit q10, q1, q3 +; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128] ; CHECK-NEXT: add r0, r1, #32 -; CHECK-NEXT: vbsl q12, q2, q0 -; CHECK-NEXT: vst1.64 {d30, d31}, [r0:128] +; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128] ; CHECK-NEXT: add r0, r1, #48 -; CHECK-NEXT: vst1.64 {d28, d29}, [r0:128] +; CHECK-NEXT: vst1.64 {d30, d31}, [r0:128] ; CHECK-NEXT: add r0, r1, #64 -; CHECK-NEXT: vst1.64 {d18, d19}, [r1:128]! -; CHECK-NEXT: vst1.64 {d26, d27}, [r1:128] +; CHECK-NEXT: vst1.64 {d16, d17}, [r1:128]! +; CHECK-NEXT: vst1.64 {d28, d29}, [r1:128] ; CHECK-NEXT: mov r1, #32 -; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128], r1 -; CHECK-NEXT: vst1.64 {d22, d23}, [r0:128]! -; CHECK-NEXT: vst1.64 {d24, d25}, [r0:128] +; CHECK-NEXT: vst1.64 {d24, d25}, [r0:128], r1 +; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128] ; CHECK-NEXT: add sp, sp, #8 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, sp, #4 Index: llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll =================================================================== --- llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll +++ llvm/test/CodeGen/Thumb2/float-intrinsics-double.ll @@ -129,8 +129,7 @@ ; VFP: bfi r1, [[REG]], #31, #1 ; NEON: vmov.i32 d16, #0x80000000 ; NEON-NEXT: vshl.i64 d16, d16, #32 -; NEON-NEXT: vbsl d16, d1, d0 -; NEON-NEXT: vorr d0, d16, d16 +; NEON-NEXT: vbit d0, d1, d16 ; NEON-NEXT: bx lr %1 = call double @llvm.copysign.f64(double %a, double %b) ret double %1 Index: llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll =================================================================== --- llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll +++ llvm/test/CodeGen/Thumb2/float-intrinsics-float.ll @@ -124,18 +124,18 @@ ; VFP: lsrs [[REG:r[0-9]+]], r{{[0-9]+}}, #31 ; VFP: bfi r{{[0-9]+}}, [[REG]], #31, #1 ; NEON-A7: @ %bb.0: -; NEON-A7-NEXT: vmov.f32 s4, s1 +; NEON-A7-NEXT: vmov.f32 s2, s1 ; NEON-A7-NEXT: @ kill: def $s0 killed $s0 def $d0 -; NEON-A7-NEXT: vmov.i32 d1, #0x80000000 -; NEON-A7-NEXT: vbsl d1, d2, d0 -; NEON-A7-NEXT: vmov.f32 s0, s2 +; NEON-A7-NEXT: vmov.i32 d16, #0x80000000 +; NEON-A7-NEXT: vbit d0, d1, d16 +; NEON-A7-NEXT: @ kill: def $s0 killed $s0 killed $d0 ; NEON-A7-NEXT: bx lr ; NEON-A57: @ %bb.0: -; NEON-A57-NEXT: vmov.f32 s4, s1 -; NEON-A57-NEXT: vmov.i32 d1, #0x80000000 +; NEON-A57-NEXT: vmov.f32 s2, s1 +; NEON-A57-NEXT: vmov.i32 d16, #0x80000000 ; NEON-A57-NEXT: @ kill: def $s0 killed $s0 def $d0 -; NEON-A57-NEXT: vbsl d1, d2, d0 -; NEON-A57-NEXT: vmov.f32 s0, s2 +; NEON-A57-NEXT: vbit d0, d1, d16 +; NEON-A57-NEXT: @ kill: def $s0 killed $s0 killed $d0 ; NEON-A57-NEXT: bx lr %1 = call float @llvm.copysign.f32(float %a, float %b) ret float %1 Index: llvm/test/MC/ARM/neon-bitwise-encoding.s =================================================================== --- llvm/test/MC/ARM/neon-bitwise-encoding.s +++ llvm/test/MC/ARM/neon-bitwise-encoding.s @@ -101,10 +101,17 @@ vbsl d18, d17, d16 vbsl q8, q10, q9 + vbit d18, d17, d16 + vbit q8, q10, q9 + vbif d18, d17, d16 + vbif q8, q10, q9 @ CHECK: vbsl d18, d17, d16 @ encoding: [0xb0,0x21,0x51,0xf3] @ CHECK: vbsl q8, q10, q9 @ encoding: [0xf2,0x01,0x54,0xf3] - +@ CHECK: vbit d18, d17, d16 @ encoding: [0xb0,0x21,0x61,0xf3] +@ CHECK: vbit q8, q10, q9 @ encoding: [0xf2,0x01,0x64,0xf3] +@ CHECK: vbif d18, d17, d16 @ encoding: [0xb0,0x21,0x71,0xf3] +@ CHECK: vbif q8, q10, q9 @ encoding: [0xf2,0x01,0x74,0xf3] 
@ Size suffices are optional. veor q4, q7, q3 Index: llvm/test/MC/ARM/neont2-bitwise-encoding.s =================================================================== --- llvm/test/MC/ARM/neont2-bitwise-encoding.s +++ llvm/test/MC/ARM/neont2-bitwise-encoding.s @@ -50,6 +50,14 @@ vbsl d18, d17, d16 vbsl q8, q10, q9 + vbit d18, d17, d16 + vbit q8, q10, q9 + vbif d18, d17, d16 + vbif q8, q10, q9 @ CHECK: vbsl d18, d17, d16 @ encoding: [0x51,0xff,0xb0,0x21] @ CHECK: vbsl q8, q10, q9 @ encoding: [0x54,0xff,0xf2,0x01] +@ CHECK: vbit d18, d17, d16 @ encoding: [0x61,0xff,0xb0,0x21] +@ CHECK: vbit q8, q10, q9 @ encoding: [0x64,0xff,0xf2,0x01] +@ CHECK: vbif d18, d17, d16 @ encoding: [0x71,0xff,0xb0,0x21] +@ CHECK: vbif q8, q10, q9 @ encoding: [0x74,0xff,0xf2,0x01] Index: llvm/test/MC/Disassembler/ARM/neon-tests.txt =================================================================== --- llvm/test/MC/Disassembler/ARM/neon-tests.txt +++ llvm/test/MC/Disassembler/ARM/neon-tests.txt @@ -3,6 +3,9 @@ # CHECK: vbif q15, q7, q0 0x50 0xe1 0x7e 0xf3 +# CHECK: vbit q15, q7, q0 +0x50 0xe1 0x6e 0xf3 + # CHECK: vcvt.f32.s32 q15, q0, #1 0x50 0xee 0xff 0xf2 Index: llvm/test/MC/Disassembler/ARM/neon.txt =================================================================== --- llvm/test/MC/Disassembler/ARM/neon.txt +++ llvm/test/MC/Disassembler/ARM/neon.txt @@ -326,6 +326,15 @@ 0xf2 0x01 0x54 0xf3 # CHECK: vbsl q8, q10, q9 +0xb0 0x21 0x61 0xf3 +# CHECK: vbit d18, d17, d16 +0xf2 0x01 0x64 0xf3 +# CHECK: vbit q8, q10, q9 + +0xb0 0x21 0x71 0xf3 +# CHECK: vbif d18, d17, d16 +0xf2 0x01 0x74 0xf3 +# CHECK: vbif q8, q10, q9 # CHECK: vceq.i8 d16, d16, d17 # CHECK: vceq.i16 d16, d16, d17 Index: llvm/test/MC/Disassembler/ARM/neont2.txt =================================================================== --- llvm/test/MC/Disassembler/ARM/neont2.txt +++ llvm/test/MC/Disassembler/ARM/neont2.txt @@ -320,6 +320,16 @@ 0x54 0xff 0xf2 0x01 # CHECK: vbsl q8, q10, q9 +0x61 0xff 0xb0 0x21 +# CHECK: vbit d18, d17, d16 +0x64 0xff 0xf2 0x01 +# CHECK: vbit q8, q10, q9 + +0x71 0xff 0xb0 0x21 +# CHECK: vbif d18, d17, d16 +0x74 0xff 0xf2 0x01 +# CHECK: vbif q8, q10, q9 + 0xfb 0xff 0x20 0x07 # CHECK: vcvt.s32.f32 d16, d16 0xfb 0xff 0xa0 0x07