Consider the following code.
#include <memory>
#include <vector>

// Reproducer type: owns a single char buffer that is sized at construction.
class TestClass
{
public:
    // Builds the buffer with `size` value-initialized (zero) chars.
    TestClass(size_t size)
        : Data(size)
    {
    }

private:
    std::vector<char> Data;
};

// Each pass constructs a new TestClass holding a 0x10000-element vector and
// hands it to the unique_ptr, which releases the previously held object.
int main(void)
{
    std::unique_ptr<TestClass> test;

    int pass = 0;
    while (pass < 100000) {
        test.reset(new TestClass(0x10000));
        ++pass;
    }

    return 0;
}
With clang 5.0.1 it runs for 14 seconds on my laptop. If you replace `char` with `short`, it becomes 35 times faster (wow). The main difference in the generated code is that for `char` no `memset` is called inside the `__construct_at_end` function.
This change manipulates a local variable in the loop, which lets the optimizer eliminate the loop entirely (it becomes a single `memset` call).
Prior to this change, the code on the lines marked `<` below was generated (on x86-64); the lines marked `>` show what is generated with the change:
51,79c58,66
< movq %rax, 8(%rbx)
< movq %rax, (%rbx)
< movq %rax, %rcx
< addq $65536, %rcx # imm = 0x10000
< movq %rcx, 16(%rbx)
< movq $-65536, %rcx # imm = 0xFFFFFFFFFFFF0000
< .align 16, 0x90
< .LBB0_4: # Parent Loop BB0_1 Depth=1
< # => This Inner Loop Header: Depth=2
< movb $0, (%rax)
< movq 8(%rbx), %rax
< leaq 1(%rax), %rdx
< movq %rdx, 8(%rbx)
< movb $0, 1(%rax)
< movq 8(%rbx), %rax
< leaq 1(%rax), %rdx
< movq %rdx, 8(%rbx)
< movb $0, 1(%rax)
< movq 8(%rbx), %rax
< leaq 1(%rax), %rdx
< movq %rdx, 8(%rbx)
< movb $0, 1(%rax)
< movq 8(%rbx), %rax
< incq %rax
< movq %rax, 8(%rbx)
< addq $4, %rcx
< jne .LBB0_4
< # BB#5: # %_ZN9TestClassC2Em.exit
< # in Loop: Header=BB0_1 Depth=1
---
> movq %rax, (%r12)
> movq %rax, %rbx
> addq $65536, %rbx # imm = 0x10000
> movq %rbx, 16(%r12)
> xorl %esi, %esi
> movl $65536, %edx # imm = 0x10000
> movq %rax, %rdi
> callq memset
> movq %rbx, 8(%r12)
81,82c68,69
I have been asked specifically by the optimizer folks to NOT do things like this in libc++, but rather to file bugs against the optimizer.
And I have done so for this exact case: https://bugs.llvm.org/show_bug.cgi?id=35637