Index: lib/Target/ARM/ARMSelectionDAGInfo.h
===================================================================
--- lib/Target/ARM/ARMSelectionDAGInfo.h
+++ lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -48,6 +48,13 @@
                                   MachinePointerInfo DstPtrInfo,
                                   MachinePointerInfo SrcPtrInfo) const override;
 
+  SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, SDLoc dl,
+                                   SDValue Chain,
+                                   SDValue Dst, SDValue Src,
+                                   SDValue Size, unsigned Align, bool isVolatile,
+                                   MachinePointerInfo DstPtrInfo,
+                                   MachinePointerInfo SrcPtrInfo) const override;
+
   // Adjust parameters for memset, see RTABI section 4.3.4
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                                   SDValue Chain,
@@ -55,6 +62,12 @@
                                   SDValue Op3, unsigned Align,
                                   bool isVolatile,
                                   MachinePointerInfo DstPtrInfo) const override;
+
+  SDValue EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
+                                 SDValue Chain,
+                                 SDValue Dst, SDValue Src,
+                                 SDValue Size, unsigned Align,
+                                 RTLIB::Libcall LC) const;
 };
 
 }
Index: lib/Target/ARM/ARMSelectionDAGInfo.cpp
===================================================================
--- lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -24,6 +24,116 @@
 ARMSelectionDAGInfo::~ARMSelectionDAGInfo() {
 }
 
+// Emit, if possible, a specialized version of the given Libcall. Typically this
+// means selecting the appropriately aligned version, but we also convert memset
+// of 0 into memclr.
+SDValue ARMSelectionDAGInfo::
+EmitSpecializedLibcall(SelectionDAG &DAG, SDLoc dl,
+                       SDValue Chain,
+                       SDValue Dst, SDValue Src,
+                       SDValue Size, unsigned Align,
+                       RTLIB::Libcall LC) const
+{
+  const ARMSubtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
+  const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
+
+  // Only use a specialized AEABI function if the default version of this
+  // Libcall is an AEABI function.
+  if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
+    return SDValue();
+
+  // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
+  // able to translate memset to memclr and use the value to index the function
+  // name array.
+  enum {
+    AEABI_MEMCPY = 0,
+    AEABI_MEMMOVE,
+    AEABI_MEMSET,
+    AEABI_MEMCLR
+  } AEABILibcall;
+  switch (LC) {
+  case RTLIB::MEMCPY:
+    AEABILibcall = AEABI_MEMCPY;
+    break;
+  case RTLIB::MEMMOVE:
+    AEABILibcall = AEABI_MEMMOVE;
+    break;
+  case RTLIB::MEMSET:
+    AEABILibcall = AEABI_MEMSET;
+    if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
+      if (ConstantSrc->getZExtValue() == 0)
+        AEABILibcall = AEABI_MEMCLR;
+    break;
+  default:
+    return SDValue();
+  }
+
+  // Choose the most-aligned libcall variant that we can
+  enum {
+    ALIGN1 = 0,
+    ALIGN4,
+    ALIGN8
+  } AlignVariant;
+  if ((Align & 7) == 0) {
+    AlignVariant = ALIGN8;
+  } else if ((Align & 3) == 0) {
+    AlignVariant = ALIGN4;
+  } else {
+    AlignVariant = ALIGN1;
+  }
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = TLI->getDataLayout()->getIntPtrType(*DAG.getContext());
+  Entry.Node = Dst;
+  Args.push_back(Entry);
+  if (AEABILibcall == AEABI_MEMCLR) {
+    Entry.Node = Size;
+    Args.push_back(Entry);
+  } else if (AEABILibcall == AEABI_MEMSET) {
+    // Adjust parameters for memset, EABI uses format (ptr, size, value),
+    // GNU library uses (ptr, value, size)
+    // See RTABI section 4.3.4
+    Entry.Node = Size;
+    Args.push_back(Entry);
+
+    // Extend or truncate the argument to be an i32 value for the call.
+    if (Src.getValueType().bitsGT(MVT::i32))
+      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+    else
+      Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+
+    Entry.Node = Src;
+    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+    Entry.isSExt = false;
+    Args.push_back(Entry);
+  } else {
+    Entry.Node = Src;
+    Args.push_back(Entry);
+
+    Entry.Node = Size;
+    Args.push_back(Entry);
+  }
+
+  char const *FunctionNames[4][3] = {
+    { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
+    { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
+    { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
+    { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
+  };
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(TLI->getLibcallCallingConv(LC),
+               Type::getVoidTy(*DAG.getContext()),
+               DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
+                                     TLI->getPointerTy()), std::move(Args), 0)
+    .setDiscardResult();
+  std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+
+  return CallResult.second;
+}
+
 SDValue
 ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                              SDValue Chain,
@@ -42,10 +152,12 @@
   // within a subtarget-specific limit.
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
   if (!ConstantSize)
-    return SDValue();
+    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+                                  RTLIB::MEMCPY);
   uint64_t SizeVal = ConstantSize->getZExtValue();
   if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
-    return SDValue();
+    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+                                  RTLIB::MEMCPY);
 
   unsigned BytesLeft = SizeVal & 3;
   unsigned NumMemOps = SizeVal >> 2;
@@ -142,59 +254,26 @@
                      makeArrayRef(TFOps, i));
 }
 
-// Adjust parameters for memset, EABI uses format (ptr, size, value),
-// GNU library uses (ptr, value, size)
-// See RTABI section 4.3.4
+
+SDValue ARMSelectionDAGInfo::
+EmitTargetCodeForMemmove(SelectionDAG &DAG, SDLoc dl,
+                         SDValue Chain,
+                         SDValue Dst, SDValue Src,
+                         SDValue Size, unsigned Align,
+                         bool isVolatile,
+                         MachinePointerInfo DstPtrInfo,
+                         MachinePointerInfo SrcPtrInfo) const {
+  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+                                RTLIB::MEMMOVE);
+}
+
+
 SDValue ARMSelectionDAGInfo::
 EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                         SDValue Chain, SDValue Dst,
                         SDValue Src, SDValue Size,
                         unsigned Align, bool isVolatile,
                         MachinePointerInfo DstPtrInfo) const {
-  const ARMSubtarget &Subtarget =
-      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
-  // Use default for non-AAPCS (or MachO) subtargets
-  if (!Subtarget.isAAPCS_ABI() || Subtarget.isTargetMachO() ||
-      Subtarget.isTargetWindows())
-    return SDValue();
-
-  const ARMTargetLowering &TLI = *Subtarget.getTargetLowering();
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
-
-  // First argument: data pointer
-  Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
-  Entry.Node = Dst;
-  Entry.Ty = IntPtrTy;
-  Args.push_back(Entry);
-
-  // Second argument: buffer size
-  Entry.Node = Size;
-  Entry.Ty = IntPtrTy;
-  Entry.isSExt = false;
-  Args.push_back(Entry);
-
-  // Extend or truncate the argument to be an i32 value for the call.
-  if (Src.getValueType().bitsGT(MVT::i32))
-    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
-  else
-    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
-
-  // Third argument: value to fill
-  Entry.Node = Src;
-  Entry.Ty = Type::getInt32Ty(*DAG.getContext());
-  Entry.isSExt = true;
-  Args.push_back(Entry);
-
-  // Emit __eabi_memset call
-  TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(Chain)
-    .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMSET),
-               Type::getVoidTy(*DAG.getContext()),
-               DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
-                                     TLI.getPointerTy()), std::move(Args), 0)
-    .setDiscardResult();
-
-  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
-  return CallResult.second;
+  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
+                                RTLIB::MEMSET);
 }
Index: test/CodeGen/ARM/memfunc.ll
===================================================================
--- test/CodeGen/ARM/memfunc.ll
+++ test/CodeGen/ARM/memfunc.ll
@@ -18,13 +18,64 @@
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 500, i32 0, i1 false)
 
   ; EABI memset swaps arguments
+  ; CHECK-IOS: mov r1, #1
+  ; CHECK-IOS: memset
+  ; CHECK-DARWIN: movs r1, #1
+  ; CHECK-DARWIN: memset
+  ; CHECK-EABI: mov r2, #1
+  ; CHECK-EABI: __aeabi_memset
+  call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 500, i32 0, i1 false)
+
+  ; EABI uses memclr if value set to 0
   ; CHECK-IOS: mov r1, #0
   ; CHECK-IOS: memset
   ; CHECK-DARWIN: movs r1, #0
   ; CHECK-DARWIN: memset
-  ; CHECK-EABI: mov r2, #0
-  ; CHECK-EABI: __aeabi_memset
+  ; CHECK-EABI: __aeabi_memclr
   call void @llvm.memset.p0i8.i32(i8* %dest, i8 0, i32 500, i32 0, i1 false)
+
+  ; EABI uses aligned function variants if possible
+
+  ; CHECK-IOS: memmove
+  ; CHECK-DARWIN: memmove
+  ; CHECK-EABI: __aeabi_memmove4
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 500, i32 4, i1 false)
+
+  ; CHECK-IOS: memcpy
+  ; CHECK-DARWIN: memcpy
+  ; CHECK-EABI: __aeabi_memcpy4
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 500, i32 4, i1 false)
+
+  ; CHECK-IOS: memset
+  ; CHECK-DARWIN: memset
+  ; CHECK-EABI: __aeabi_memset4
+  call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 500, i32 4, i1 false)
+
+  ; CHECK-IOS: memset
+  ; CHECK-DARWIN: memset
+  ; CHECK-EABI: __aeabi_memclr4
+  call void @llvm.memset.p0i8.i32(i8* %dest, i8 0, i32 500, i32 4, i1 false)
+
+  ; CHECK-IOS: memmove
+  ; CHECK-DARWIN: memmove
+  ; CHECK-EABI: __aeabi_memmove8
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 500, i32 8, i1 false)
+
+  ; CHECK-IOS: memcpy
+  ; CHECK-DARWIN: memcpy
+  ; CHECK-EABI: __aeabi_memcpy8
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 500, i32 8, i1 false)
+
+  ; CHECK-IOS: memset
+  ; CHECK-DARWIN: memset
+  ; CHECK-EABI: __aeabi_memset8
+  call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 500, i32 8, i1 false)
+
+  ; CHECK-IOS: memset
+  ; CHECK-DARWIN: memset
+  ; CHECK-EABI: __aeabi_memclr8
+  call void @llvm.memset.p0i8.i32(i8* %dest, i8 0, i32 500, i32 8, i1 false)
+
   unreachable
 }
@@ -53,17 +104,17 @@
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
 
   ; CHECK-IOS: mov r0, sp
-  ; CHECK-IOS: mov r1, #0
+  ; CHECK-IOS: mov r1, #1
   ; CHECK-IOS: memset
-  ; CHECK-DARINW: add r0, sp, #4
-  ; CHECK-DARWIN: movs r1, #0
+  ; CHECK-DARWIN: add r0, sp, #4
+  ; CHECK-DARWIN: movs r1, #1
   ; CHECK-DARWIN: memset
   ; CHECK-EABI: add r0, sp, #4
-  ; CHECK-EABI: mov r2, #0
+  ; CHECK-EABI: mov r2, #1
   ; CHECK-EABI: __aeabi_memset
   %arr2 = alloca [9 x i8], align 1
   %2 = bitcast [9 x i8]* %arr2 to i8*
-  call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+  call void @llvm.memset.p0i8.i32(i8* %2, i8 1, i32 %n, i32 0, i1 false)
 
   unreachable
 }
@@ -90,15 +141,15 @@
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
 
   ; CHECK: {{add(.w)? r0, sp, #3}}
-  ; CHECK-IOS: mov r1, #0
+  ; CHECK-IOS: mov r1, #1
   ; CHECK-IOS: memset
-  ; CHECK-DARWIN: movs r1, #0
+  ; CHECK-DARWIN: movs r1, #1
   ; CHECK-DARWIN: memset
-  ; CHECK-EABI: mov r2, #0
+  ; CHECK-EABI: mov r2, #1
   ; CHECK-EABI: __aeabi_memset
   %arr2 = alloca [7 x i8], align 1
   %2 = bitcast [7 x i8]* %arr2 to i8*
-  call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+  call void @llvm.memset.p0i8.i32(i8* %2, i8 1, i32 %n, i32 0, i1 false)
 
   unreachable
 }
@@ -125,15 +176,15 @@
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
 
   ; CHECK: {{add(.w)? r., sp, #(1|5)}}
-  ; CHECK-IOS: mov r1, #0
+  ; CHECK-IOS: mov r1, #1
   ; CHECK-IOS: memset
-  ; CHECK-DARWIN: movs r1, #0
+  ; CHECK-DARWIN: movs r1, #1
   ; CHECK-DARWIN: memset
-  ; CHECK-EABI: mov r2, #0
+  ; CHECK-EABI: mov r2, #1
   ; CHECK-EABI: __aeabi_memset
   %arr2 = alloca [9 x i8], align 1
   %2 = getelementptr inbounds [9 x i8], [9 x i8]* %arr2, i32 0, i32 4
-  call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+  call void @llvm.memset.p0i8.i32(i8* %2, i8 1, i32 %n, i32 0, i1 false)
 
   unreachable
 }
@@ -160,15 +211,15 @@
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
 
   ; CHECK: {{add(.w)? r., sp, #(1|5)}}
-  ; CHECK-IOS: mov r1, #0
+  ; CHECK-IOS: mov r1, #1
   ; CHECK-IOS: memset
-  ; CHECK-DARWIN: movs r1, #0
+  ; CHECK-DARWIN: movs r1, #1
   ; CHECK-DARWIN: memset
-  ; CHECK-EABI: mov r2, #0
+  ; CHECK-EABI: mov r2, #1
   ; CHECK-EABI: __aeabi_memset
   %arr2 = alloca [13 x i8], align 1
   %2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 1
-  call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+  call void @llvm.memset.p0i8.i32(i8* %2, i8 1, i32 %n, i32 0, i1 false)
 
   unreachable
 }
@@ -195,15 +246,15 @@
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
 
   ; CHECK: {{add(.w)? r., sp, #(1|5)}}
-  ; CHECK-IOS: mov r1, #0
+  ; CHECK-IOS: mov r1, #1
   ; CHECK-IOS: memset
-  ; CHECK-DARWIN: movs r1, #0
+  ; CHECK-DARWIN: movs r1, #1
   ; CHECK-DARWIN: memset
-  ; CHECK-EABI: mov r2, #0
+  ; CHECK-EABI: mov r2, #1
   ; CHECK-EABI: __aeabi_memset
   %arr2 = alloca [13 x i8], align 1
   %2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 %i
-  call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+  call void @llvm.memset.p0i8.i32(i8* %2, i8 1, i32 %n, i32 0, i1 false)
 
   unreachable
 }
@@ -230,15 +281,15 @@
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
 
   ; CHECK: {{add(.w)? r., sp, #(1|5)}}
-  ; CHECK-IOS: mov r1, #0
+  ; CHECK-IOS: mov r1, #1
   ; CHECK-IOS: memset
-  ; CHECK-DARWIN: movs r1, #0
+  ; CHECK-DARWIN: movs r1, #1
   ; CHECK-DARWIN: memset
-  ; CHECK-EABI: mov r2, #0
+  ; CHECK-EABI: mov r2, #1
   ; CHECK-EABI: __aeabi_memset
   %arr2 = alloca [13 x i8], align 1
   %2 = getelementptr [13 x i8], [13 x i8]* %arr2, i32 0, i32 4
-  call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+  call void @llvm.memset.p0i8.i32(i8* %2, i8 1, i32 %n, i32 0, i1 false)
 
   unreachable
 }
@@ -265,15 +316,15 @@
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i32 0, i1 false)
 
   ; CHECK: {{add(.w)? r., sp, #(1|5)}}
-  ; CHECK-IOS: mov r1, #0
+  ; CHECK-IOS: mov r1, #1
   ; CHECK-IOS: memset
-  ; CHECK-DARWIN: movs r1, #0
+  ; CHECK-DARWIN: movs r1, #1
   ; CHECK-DARWIN: memset
-  ; CHECK-EABI: mov r2, #0
+  ; CHECK-EABI: mov r2, #1
   ; CHECK-EABI: __aeabi_memset
   %arr2 = alloca [13 x i8], align 1
   %2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 16
-  call void @llvm.memset.p0i8.i32(i8* %2, i8 0, i32 %n, i32 0, i1 false)
+  call void @llvm.memset.p0i8.i32(i8* %2, i8 1, i32 %n, i32 0, i1 false)
 
   unreachable
 }
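
For reference, the AEABI memory routines selected by EmitSpecializedLibcall above are specified in the ARM RTABI, section 4.3.4. A minimal sketch of their C prototypes follows, for illustration only (it is not part of the patch): the 4/8 suffix encodes the guaranteed alignment of the pointer arguments, the memset family takes (dest, size, value) rather than the ISO C (dest, value, size) order, and memclr takes no value argument at all, which is why the memclr path in the new code only passes Dst and Size.

  // Reference prototypes per ARM RTABI section 4.3.4 (illustrative sketch).
  #include <stddef.h>

  void __aeabi_memcpy8 (void *dest, const void *src, size_t n);
  void __aeabi_memcpy4 (void *dest, const void *src, size_t n);
  void __aeabi_memcpy  (void *dest, const void *src, size_t n);

  void __aeabi_memmove8(void *dest, const void *src, size_t n);
  void __aeabi_memmove4(void *dest, const void *src, size_t n);
  void __aeabi_memmove (void *dest, const void *src, size_t n);

  // Note the (dest, size, value) argument order, unlike ISO C memset.
  void __aeabi_memset8 (void *dest, size_t n, int c);
  void __aeabi_memset4 (void *dest, size_t n, int c);
  void __aeabi_memset  (void *dest, size_t n, int c);

  void __aeabi_memclr8 (void *dest, size_t n);
  void __aeabi_memclr4 (void *dest, size_t n);
  void __aeabi_memclr  (void *dest, size_t n);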