Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5775,6 +5775,7 @@
                                        MachinePointerInfo DstPtrInfo,
                                        MachinePointerInfo SrcPtrInfo) {
   // Turn a memcpy of undef to nop.
+  // FIXME: We need to honor volatile even if Src is undef.
   if (Src.isUndef())
     return Chain;
 
@@ -5801,13 +5802,12 @@
   bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
   unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
 
-  if (!TLI.findOptimalMemOpLowering(MemOps, Limit, Size,
-                                    (DstAlignCanChange ? 0 : Align),
-                                    (isZeroConstant ? 0 : SrcAlign),
-                                    false, false, CopyFromConstant, true,
-                                    DstPtrInfo.getAddrSpace(),
-                                    SrcPtrInfo.getAddrSpace(),
-                                    MF.getFunction().getAttributes()))
+  if (!TLI.findOptimalMemOpLowering(
+          MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align),
+          (isZeroConstant ? 0 : SrcAlign), /*IsMemset=*/false,
+          /*ZeroMemset=*/false, /*MemcpyStrSrc=*/CopyFromConstant,
+          /*AllowOverlap=*/!isVol, DstPtrInfo.getAddrSpace(),
+          SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes()))
     return SDValue();
 
   if (DstAlignCanChange) {
@@ -5961,6 +5961,7 @@
                                         MachinePointerInfo DstPtrInfo,
                                         MachinePointerInfo SrcPtrInfo) {
   // Turn a memmove of undef to nop.
+  // FIXME: We need to honor volatile even if Src is undef.
   if (Src.isUndef())
     return Chain;
 
@@ -5981,13 +5982,15 @@
   if (Align > SrcAlign)
     SrcAlign = Align;
   unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
-
-  if (!TLI.findOptimalMemOpLowering(MemOps, Limit, Size,
-                                    (DstAlignCanChange ? 0 : Align), SrcAlign,
-                                    false, false, false, false,
-                                    DstPtrInfo.getAddrSpace(),
-                                    SrcPtrInfo.getAddrSpace(),
-                                    MF.getFunction().getAttributes()))
+  // FIXME: `AllowOverlap` should really be `!isVol` but there is a bug in
+  // findOptimalMemOpLowering. Meanwhile, setting it to `false` produces the
+  // correct code.
+  bool AllowOverlap = false;
+  if (!TLI.findOptimalMemOpLowering(
+          MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), SrcAlign,
+          /*IsMemset=*/false, /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
+          AllowOverlap, DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+          MF.getFunction().getAttributes()))
     return SDValue();
 
   if (DstAlignCanChange) {
@@ -6066,6 +6069,7 @@
                                uint64_t Size, unsigned Align, bool isVol,
                                MachinePointerInfo DstPtrInfo) {
   // Turn a memset of undef to nop.
+  // FIXME: We need to honor volatile even if Src is undef.
   if (Src.isUndef())
     return Chain;
 
@@ -6082,11 +6086,12 @@
     DstAlignCanChange = true;
   bool IsZeroVal =
       isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
-  if (!TLI.findOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
-                                    Size, (DstAlignCanChange ? 0 : Align), 0,
-                                    true, IsZeroVal, false, true,
-                                    DstPtrInfo.getAddrSpace(), ~0u,
-                                    MF.getFunction().getAttributes()))
+  if (!TLI.findOptimalMemOpLowering(
+          MemOps, TLI.getMaxStoresPerMemset(OptSize), Size,
+          (DstAlignCanChange ? 0 : Align), 0, /*IsMemset=*/true,
+          /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
+          /*AllowOverlap=*/!isVol, DstPtrInfo.getAddrSpace(), ~0u,
+          MF.getFunction().getAttributes()))
     return SDValue();
 
   if (DstAlignCanChange) {
Index: llvm/trunk/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll
+++ llvm/trunk/test/CodeGen/X86/volatile-memstores-nooverlapping-load-stores.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1 immarg) #1
+define dso_local void @copy_7_bytes(i8* noalias nocapture, i8* noalias nocapture readonly) nounwind #0 {
+; CHECK-LABEL: copy_7_bytes:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl (%rsi), %eax
+; CHECK-NEXT:    movl 3(%rsi), %ecx
+; CHECK-NEXT:    movl %ecx, 3(%rdi)
+; CHECK-NEXT:    movl %eax, (%rdi)
+; CHECK-NEXT:    retq
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 7, i1 false)
+  ret void
+}
+define dso_local void @copy_7_bytes_volatile(i8* noalias nocapture, i8* noalias nocapture readonly) nounwind #0 {
+; CHECK-LABEL: copy_7_bytes_volatile:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movb 6(%rsi), %al
+; CHECK-NEXT:    movb %al, 6(%rdi)
+; CHECK-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-NEXT:    movw %ax, 4(%rdi)
+; CHECK-NEXT:    movl (%rsi), %eax
+; CHECK-NEXT:    movl %eax, (%rdi)
+; CHECK-NEXT:    retq
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 7, i1 true)
+  ret void
+}
+
+
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1 immarg) #1
+define dso_local void @move_7_bytes(i8* nocapture, i8* nocapture readonly) nounwind #0 {
+; CHECK-LABEL: move_7_bytes:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl (%rsi), %eax
+; CHECK-NEXT:    movzwl 4(%rsi), %ecx
+; CHECK-NEXT:    movb 6(%rsi), %dl
+; CHECK-NEXT:    movb %dl, 6(%rdi)
+; CHECK-NEXT:    movw %cx, 4(%rdi)
+; CHECK-NEXT:    movl %eax, (%rdi)
+; CHECK-NEXT:    retq
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 7, i1 false)
+  ret void
+}
+define dso_local void @move_7_bytes_volatile(i8* nocapture, i8* nocapture readonly) nounwind #0 {
+; CHECK-LABEL: move_7_bytes_volatile:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl (%rsi), %eax
+; CHECK-NEXT:    movzwl 4(%rsi), %ecx
+; CHECK-NEXT:    movb 6(%rsi), %dl
+; CHECK-NEXT:    movb %dl, 6(%rdi)
+; CHECK-NEXT:    movw %cx, 4(%rdi)
+; CHECK-NEXT:    movl %eax, (%rdi)
+; CHECK-NEXT:    retq
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 %1, i64 7, i1 true)
+  ret void
+}
+
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1
+define dso_local void @set_7_bytes(i8* noalias nocapture) nounwind #0 {
+; CHECK-LABEL: set_7_bytes:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $16843009, 3(%rdi) # imm = 0x1010101
+; CHECK-NEXT:    movl $16843009, (%rdi) # imm = 0x1010101
+; CHECK-NEXT:    retq
+  tail call void @llvm.memset.p0i8.i64(i8* align 1 %0, i8 1, i64 7, i1 false)
+  ret void
+}
+define dso_local void @set_7_bytes_volatile(i8* noalias nocapture) nounwind #0 {
+; CHECK-LABEL: set_7_bytes_volatile:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movb $1, 6(%rdi)
+; CHECK-NEXT:    movw $257, 4(%rdi) # imm = 0x101
+; CHECK-NEXT:    movl $16843009, (%rdi) # imm = 0x1010101
+; CHECK-NEXT:    retq
+  tail call void @llvm.memset.p0i8.i64(i8* align 1 %0, i8 1, i64 7, i1 true)
+  ret void
+}
+
+attributes #0 = { noreturn nounwind uwtable "target-cpu"="x86-64" }
+attributes #1 = { argmemonly nounwind }
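
For readers comparing the CHECK lines above: the non-volatile @copy_7_bytes is lowered to two 4-byte loads/stores, with the second pair starting at offset 3 so it overlaps the first by one byte, while the volatile variants are split into non-overlapping 4-, 2- and 1-byte accesses. The standalone C++ sketch below is not LLVM code and does not call findOptimalMemOpLowering; it only illustrates, under a simplified power-of-two store-size model with a hypothetical splitCopy helper, how disallowing overlap turns a 7-byte operation from {4, 4} into {4, 2, 1}.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical illustration only (not the LLVM implementation): greedily
// pick store sizes for an operation of `size` bytes, using power-of-two
// chunks up to `maxStore` bytes. When `allowOverlap` is true, a final chunk
// wider than the remaining tail is kept and simply shifted back so it
// overlaps the previous store; when it is false (the volatile case after
// this patch), the tail is covered by smaller, disjoint stores instead.
static std::vector<uint64_t> splitCopy(uint64_t size, bool allowOverlap) {
  const uint64_t maxStore = 4; // e.g. plain 32-bit stores
  std::vector<uint64_t> chunks;
  uint64_t remaining = size;
  while (remaining > 0) {
    uint64_t chunk = maxStore;
    while (chunk > remaining) {
      if (allowOverlap && !chunks.empty())
        break; // keep the wide store; it will overlap the previous one
      chunk /= 2;
    }
    chunks.push_back(chunk);
    remaining -= std::min(chunk, remaining);
  }
  return chunks;
}

int main() {
  for (bool overlap : {true, false}) {
    std::cout << (overlap ? "overlap allowed:    " : "overlap disallowed: ");
    for (uint64_t c : splitCopy(7, overlap))
      std::cout << c << ' ';
    std::cout << '\n'; // prints "4 4" then "4 2 1"
  }
  return 0;
}

The {4, 4} split corresponds to the two movl pairs checked in copy_7_bytes, and the {4, 2, 1} split to the movl/movw/movb sequence checked in the *_volatile tests, which is what passing /*AllowOverlap=*/!isVol (or false for memmove) achieves in the patch.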