Index: lib/Target/X86/X86InstrCompiler.td
===================================================================
--- lib/Target/X86/X86InstrCompiler.td
+++ lib/Target/X86/X86InstrCompiler.td
@@ -752,6 +752,8 @@
 /* The following multiclass tries to make sure that in code like
  *     x.store (immediate op x.load(acquire), release)
+ * and
+ *     x.store (register op x.load(acquire), release)
  * an operation directly on memory is generated instead of wasting a register.
  * It is not automatic as atomic_store/load are only lowered to MOV instructions
  * extremely late to prevent them from being accidentally reordered in the backend
@@ -762,16 +764,28 @@
                     "#RELEASE_BINOP PSEUDO!",
                     [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
                         (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
+    def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src),
+                    "#RELEASE_BINOP PSEUDO!",
+                    [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
+                        (atomic_load_8 addr:$dst), GR8:$src))]>;
     // NAME#16 is not generated as 16-bit arithmetic instructions are considered
     // costly and avoided as far as possible by this backend anyway
     def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
                     "#RELEASE_BINOP PSEUDO!",
                     [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
                         (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
+    def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+                    "#RELEASE_BINOP PSEUDO!",
+                    [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
+                        (atomic_load_32 addr:$dst), GR32:$src))]>;
     def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
                     "#RELEASE_BINOP PSEUDO!",
                     [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
                         (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
+    def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+                    "#RELEASE_BINOP PSEUDO!",
+                    [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
+                        (atomic_load_64 addr:$dst), GR64:$src))]>;
 }
 defm RELEASE_ADD : RELEASE_BINOP_MI<"add">;
 defm RELEASE_AND : RELEASE_BINOP_MI<"and">;
Index: lib/Target/X86/X86MCInstLower.cpp
===================================================================
--- lib/Target/X86/X86MCInstLower.cpp
+++ lib/Target/X86/X86MCInstLower.cpp
@@ -598,17 +598,29 @@
   case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
   case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
   case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
+  case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
   case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
+  case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
   case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
+  case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
   case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
+  case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
   case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
+  case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
   case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
+  case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
   case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
+  case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
   case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
+  case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
   case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
+  case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
   case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
+  case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
   case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
+  case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
   case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
+  case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
   case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
   case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
   case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
Index: test/CodeGen/X86/atomic_mi.ll
===================================================================
--- test/CodeGen/X86/atomic_mi.ll
+++ test/CodeGen/X86/atomic_mi.ll
@@ -94,12 +94,12 @@
 
 ; ----- ADD -----
 
-define void @add_8(i8* %p) {
-; X64-LABEL: add_8
+define void @add_8i(i8* %p) {
+; X64-LABEL: add_8i
 ; X64-NOT: lock
 ; X64: addb
 ; X64-NOT: movb
-; X32-LABEL: add_8
+; X32-LABEL: add_8i
 ; X32-NOT: lock
 ; X32: addb
 ; X32-NOT: movb
@@ -109,12 +109,27 @@
   ret void
 }
 
-define void @add_16(i16* %p) {
+define void @add_8r(i8* %p, i8 %v) {
+; X64-LABEL: add_8r
+; X64-NOT: lock
+; X64: addb
+; X64-NOT: movb
+; X32-LABEL: add_8r
+; X32-NOT: lock
+; X32: addb
+; X32-NOT: movb
+  %1 = load atomic i8, i8* %p seq_cst, align 1
+  %2 = add i8 %1, %v
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @add_16i(i16* %p) {
 ; Currently the transformation is not done on 16 bit accesses, as the backend
 ; treat 16 bit arithmetic as expensive on X86/X86_64.
-; X64-LABEL: add_16
+; X64-LABEL: add_16i
 ; X64-NOT: addw
-; X32-LABEL: add_16
+; X32-LABEL: add_16i
 ; X32-NOT: addw
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = add i16 %1, 2
@@ -122,12 +137,25 @@
   ret void
 }
 
-define void @add_32(i32* %p) {
-; X64-LABEL: add_32
+define void @add_16r(i16* %p, i16 %v) {
+; Currently the transformation is not done on 16 bit accesses, as the backend
+; treat 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: add_16r
+; X64-NOT: addw
+; X32-LABEL: add_16r
+; X32-NOT: addw [.*], (
+  %1 = load atomic i16, i16* %p acquire, align 2
+  %2 = add i16 %1, %v
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @add_32i(i32* %p) {
+; X64-LABEL: add_32i
 ; X64-NOT: lock
 ; X64: addl
 ; X64-NOT: movl
-; X32-LABEL: add_32
+; X32-LABEL: add_32i
 ; X32-NOT: lock
 ; X32: addl
 ; X32-NOT: movl
@@ -137,23 +165,51 @@
   ret void
 }
 
-define void @add_64(i64* %p) {
-; X64-LABEL: add_64
+define void @add_32r(i32* %p, i32 %v) {
+; X64-LABEL: add_32r
+; X64-NOT: lock
+; X64: addl
+; X64-NOT: movl
+; X32-LABEL: add_32r
+; X32-NOT: lock
+; X32: addl
+; X32-NOT: movl
+  %1 = load atomic i32, i32* %p acquire, align 4
+  %2 = add i32 %1, %v
+  store atomic i32 %2, i32* %p monotonic, align 4
+  ret void
+}
+
+define void @add_64i(i64* %p) {
+; X64-LABEL: add_64i
 ; X64-NOT: lock
 ; X64: addq
 ; X64-NOT: movq
 ; We do not check X86-32 as it cannot do 'addq'.
-; X32-LABEL: add_64
+; X32-LABEL: add_64i
   %1 = load atomic i64, i64* %p acquire, align 8
   %2 = add i64 %1, 2
   store atomic i64 %2, i64* %p release, align 8
   ret void
 }
 
-define void @add_32_seq_cst(i32* %p) {
-; X64-LABEL: add_32_seq_cst
+define void @add_64r(i64* %p, i64 %v) {
+; X64-LABEL: add_64r
+; X64-NOT: lock
+; X64: addq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'addq'.
+; X32-LABEL: add_64r
+  %1 = load atomic i64, i64* %p acquire, align 8
+  %2 = add i64 %1, %v
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @add_32i_seq_cst(i32* %p) {
+; X64-LABEL: add_32i_seq_cst
 ; X64: xchgl
-; X32-LABEL: add_32_seq_cst
+; X32-LABEL: add_32i_seq_cst
 ; X32: xchgl
   %1 = load atomic i32, i32* %p monotonic, align 4
   %2 = add i32 %1, 2
@@ -161,14 +217,25 @@
   ret void
 }
 
+define void @add_32r_seq_cst(i32* %p, i32 %v) {
+; X64-LABEL: add_32r_seq_cst
+; X64: xchgl
+; X32-LABEL: add_32r_seq_cst
+; X32: xchgl
+  %1 = load atomic i32, i32* %p monotonic, align 4
+  %2 = add i32 %1, %v
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
 ; ----- AND -----
 
-define void @and_8(i8* %p) {
-; X64-LABEL: and_8
+define void @and_8i(i8* %p) {
+; X64-LABEL: and_8i
 ; X64-NOT: lock
 ; X64: andb
 ; X64-NOT: movb
-; X32-LABEL: and_8
+; X32-LABEL: and_8i
 ; X32-NOT: lock
 ; X32: andb
 ; X32-NOT: movb
@@ -178,12 +245,27 @@
   ret void
 }
 
-define void @and_16(i16* %p) {
+define void @and_8r(i8* %p, i8 %v) {
+; X64-LABEL: and_8r
+; X64-NOT: lock
+; X64: andb
+; X64-NOT: movb
+; X32-LABEL: and_8r
+; X32-NOT: lock
+; X32: andb
+; X32-NOT: movb
+  %1 = load atomic i8, i8* %p monotonic, align 1
+  %2 = and i8 %1, %v
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @and_16i(i16* %p) {
 ; Currently the transformation is not done on 16 bit accesses, as the backend
 ; treat 16 bit arithmetic as expensive on X86/X86_64.
-; X64-LABEL: and_16
+; X64-LABEL: and_16i
 ; X64-NOT: andw
-; X32-LABEL: and_16
+; X32-LABEL: and_16i
 ; X32-NOT: andw
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = and i16 %1, 2
@@ -191,12 +273,25 @@
   ret void
 }
 
-define void @and_32(i32* %p) {
-; X64-LABEL: and_32
+define void @and_16r(i16* %p, i16 %v) {
+; Currently the transformation is not done on 16 bit accesses, as the backend
+; treat 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: and_16r
+; X64-NOT: andw
+; X32-LABEL: and_16r
+; X32-NOT: andw [.*], (
+  %1 = load atomic i16, i16* %p acquire, align 2
+  %2 = and i16 %1, %v
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @and_32i(i32* %p) {
+; X64-LABEL: and_32i
 ; X64-NOT: lock
 ; X64: andl
 ; X64-NOT: movl
-; X32-LABEL: and_32
+; X32-LABEL: and_32i
 ; X32-NOT: lock
 ; X32: andl
 ; X32-NOT: movl
@@ -206,23 +301,51 @@
   ret void
 }
 
-define void @and_64(i64* %p) {
-; X64-LABEL: and_64
+define void @and_32r(i32* %p, i32 %v) {
+; X64-LABEL: and_32r
+; X64-NOT: lock
+; X64: andl
+; X64-NOT: movl
+; X32-LABEL: and_32r
+; X32-NOT: lock
+; X32: andl
+; X32-NOT: movl
+  %1 = load atomic i32, i32* %p acquire, align 4
+  %2 = and i32 %1, %v
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @and_64i(i64* %p) {
+; X64-LABEL: and_64i
 ; X64-NOT: lock
 ; X64: andq
 ; X64-NOT: movq
 ; We do not check X86-32 as it cannot do 'andq'.
-; X32-LABEL: and_64
+; X32-LABEL: and_64i
   %1 = load atomic i64, i64* %p acquire, align 8
   %2 = and i64 %1, 2
   store atomic i64 %2, i64* %p release, align 8
   ret void
 }
 
-define void @and_32_seq_cst(i32* %p) {
-; X64-LABEL: and_32_seq_cst
+define void @and_64r(i64* %p, i64 %v) {
+; X64-LABEL: and_64r
+; X64-NOT: lock
+; X64: andq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'andq'.
+; X32-LABEL: and_64r
+  %1 = load atomic i64, i64* %p acquire, align 8
+  %2 = and i64 %1, %v
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @and_32i_seq_cst(i32* %p) {
+; X64-LABEL: and_32i_seq_cst
 ; X64: xchgl
-; X32-LABEL: and_32_seq_cst
+; X32-LABEL: and_32i_seq_cst
 ; X32: xchgl
   %1 = load atomic i32, i32* %p monotonic, align 4
   %2 = and i32 %1, 2
@@ -230,14 +353,25 @@
   ret void
 }
 
+define void @and_32r_seq_cst(i32* %p, i32 %v) {
+; X64-LABEL: and_32r_seq_cst
+; X64: xchgl
+; X32-LABEL: and_32r_seq_cst
+; X32: xchgl
+  %1 = load atomic i32, i32* %p monotonic, align 4
+  %2 = and i32 %1, %v
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
 ; ----- OR -----
 
-define void @or_8(i8* %p) {
-; X64-LABEL: or_8
+define void @or_8i(i8* %p) {
+; X64-LABEL: or_8i
 ; X64-NOT: lock
 ; X64: orb
 ; X64-NOT: movb
-; X32-LABEL: or_8
+; X32-LABEL: or_8i
 ; X32-NOT: lock
 ; X32: orb
 ; X32-NOT: movb
@@ -247,10 +381,25 @@
   ret void
 }
 
-define void @or_16(i16* %p) {
-; X64-LABEL: or_16
+define void @or_8r(i8* %p, i8 %v) {
+; X64-LABEL: or_8r
+; X64-NOT: lock
+; X64: orb
+; X64-NOT: movb
+; X32-LABEL: or_8r
+; X32-NOT: lock
+; X32: orb
+; X32-NOT: movb
+  %1 = load atomic i8, i8* %p acquire, align 1
+  %2 = or i8 %1, %v
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @or_16i(i16* %p) {
+; X64-LABEL: or_16i
 ; X64-NOT: orw
-; X32-LABEL: or_16
+; X32-LABEL: or_16i
 ; X32-NOT: orw
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = or i16 %1, 2
@@ -258,12 +407,23 @@
   ret void
 }
 
-define void @or_32(i32* %p) {
-; X64-LABEL: or_32
+define void @or_16r(i16* %p, i16 %v) {
+; X64-LABEL: or_16r
+; X64-NOT: orw
+; X32-LABEL: or_16r
+; X32-NOT: orw [.*], (
+  %1 = load atomic i16, i16* %p acquire, align 2
+  %2 = or i16 %1, %v
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @or_32i(i32* %p) {
+; X64-LABEL: or_32i
 ; X64-NOT: lock
 ; X64: orl
 ; X64-NOT: movl
-; X32-LABEL: or_32
+; X32-LABEL: or_32i
 ; X32-NOT: lock
 ; X32: orl
 ; X32-NOT: movl
@@ -273,23 +433,51 @@
   ret void
 }
 
-define void @or_64(i64* %p) {
-; X64-LABEL: or_64
+define void @or_32r(i32* %p, i32 %v) {
+; X64-LABEL: or_32r
+; X64-NOT: lock
+; X64: orl
+; X64-NOT: movl
+; X32-LABEL: or_32r
+; X32-NOT: lock
+; X32: orl
+; X32-NOT: movl
+  %1 = load atomic i32, i32* %p acquire, align 4
+  %2 = or i32 %1, %v
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @or_64i(i64* %p) {
+; X64-LABEL: or_64i
 ; X64-NOT: lock
 ; X64: orq
 ; X64-NOT: movq
 ; We do not check X86-32 as it cannot do 'orq'.
-; X32-LABEL: or_64
+; X32-LABEL: or_64i
   %1 = load atomic i64, i64* %p acquire, align 8
   %2 = or i64 %1, 2
   store atomic i64 %2, i64* %p release, align 8
   ret void
 }
 
-define void @or_32_seq_cst(i32* %p) {
-; X64-LABEL: or_32_seq_cst
+define void @or_64r(i64* %p, i64 %v) {
+; X64-LABEL: or_64r
+; X64-NOT: lock
+; X64: orq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'orq'.
+; X32-LABEL: or_64r
+  %1 = load atomic i64, i64* %p acquire, align 8
+  %2 = or i64 %1, %v
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @or_32i_seq_cst(i32* %p) {
+; X64-LABEL: or_32i_seq_cst
 ; X64: xchgl
-; X32-LABEL: or_32_seq_cst
+; X32-LABEL: or_32i_seq_cst
 ; X32: xchgl
   %1 = load atomic i32, i32* %p monotonic, align 4
   %2 = or i32 %1, 2
@@ -297,14 +485,25 @@
   ret void
 }
 
+define void @or_32r_seq_cst(i32* %p, i32 %v) {
+; X64-LABEL: or_32r_seq_cst
+; X64: xchgl
+; X32-LABEL: or_32r_seq_cst
+; X32: xchgl
+  %1 = load atomic i32, i32* %p monotonic, align 4
+  %2 = or i32 %1, %v
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
 ; ----- XOR -----
 
-define void @xor_8(i8* %p) {
-; X64-LABEL: xor_8
+define void @xor_8i(i8* %p) {
+; X64-LABEL: xor_8i
 ; X64-NOT: lock
 ; X64: xorb
 ; X64-NOT: movb
-; X32-LABEL: xor_8
+; X32-LABEL: xor_8i
 ; X32-NOT: lock
 ; X32: xorb
 ; X32-NOT: movb
@@ -314,10 +513,25 @@
   ret void
 }
 
-define void @xor_16(i16* %p) {
-; X64-LABEL: xor_16
+define void @xor_8r(i8* %p, i8 %v) {
+; X64-LABEL: xor_8r
+; X64-NOT: lock
+; X64: xorb
+; X64-NOT: movb
+; X32-LABEL: xor_8r
+; X32-NOT: lock
+; X32: xorb
+; X32-NOT: movb
+  %1 = load atomic i8, i8* %p acquire, align 1
+  %2 = xor i8 %1, %v
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @xor_16i(i16* %p) {
+; X64-LABEL: xor_16i
 ; X64-NOT: xorw
-; X32-LABEL: xor_16
+; X32-LABEL: xor_16i
 ; X32-NOT: xorw
   %1 = load atomic i16, i16* %p acquire, align 2
   %2 = xor i16 %1, 2
@@ -325,12 +539,23 @@
   ret void
 }
 
-define void @xor_32(i32* %p) {
-; X64-LABEL: xor_32
+define void @xor_16r(i16* %p, i16 %v) {
+; X64-LABEL: xor_16r
+; X64-NOT: xorw
+; X32-LABEL: xor_16r
+; X32-NOT: xorw [.*], (
+  %1 = load atomic i16, i16* %p acquire, align 2
+  %2 = xor i16 %1, %v
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @xor_32i(i32* %p) {
+; X64-LABEL: xor_32i
 ; X64-NOT: lock
 ; X64: xorl
 ; X64-NOT: movl
-; X32-LABEL: xor_32
+; X32-LABEL: xor_32i
 ; X32-NOT: lock
 ; X32: xorl
 ; X32-NOT: movl
@@ -340,23 +565,51 @@
   ret void
 }
 
-define void @xor_64(i64* %p) {
-; X64-LABEL: xor_64
+define void @xor_32r(i32* %p, i32 %v) {
+; X64-LABEL: xor_32r
+; X64-NOT: lock
+; X64: xorl
+; X64-NOT: movl
+; X32-LABEL: xor_32r
+; X32-NOT: lock
+; X32: xorl
+; X32-NOT: movl
+  %1 = load atomic i32, i32* %p acquire, align 4
+  %2 = xor i32 %1, %v
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @xor_64i(i64* %p) {
+; X64-LABEL: xor_64i
 ; X64-NOT: lock
 ; X64: xorq
 ; X64-NOT: movq
 ; We do not check X86-32 as it cannot do 'xorq'.
-; X32-LABEL: xor_64
+; X32-LABEL: xor_64i
   %1 = load atomic i64, i64* %p acquire, align 8
   %2 = xor i64 %1, 2
   store atomic i64 %2, i64* %p release, align 8
   ret void
 }
 
-define void @xor_32_seq_cst(i32* %p) {
-; X64-LABEL: xor_32_seq_cst
+define void @xor_64r(i64* %p, i64 %v) {
+; X64-LABEL: xor_64r
+; X64-NOT: lock
+; X64: xorq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'xorq'.
+; X32-LABEL: xor_64r
+  %1 = load atomic i64, i64* %p acquire, align 8
+  %2 = xor i64 %1, %v
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @xor_32i_seq_cst(i32* %p) {
+; X64-LABEL: xor_32i_seq_cst
 ; X64: xchgl
-; X32-LABEL: xor_32_seq_cst
+; X32-LABEL: xor_32i_seq_cst
 ; X32: xchgl
   %1 = load atomic i32, i32* %p monotonic, align 4
   %2 = xor i32 %1, 2
@@ -364,6 +617,17 @@
   ret void
 }
 
+define void @xor_32r_seq_cst(i32* %p, i32 %v) {
+; X64-LABEL: xor_32r_seq_cst
+; X64: xchgl
+; X32-LABEL: xor_32r_seq_cst
+; X32: xchgl
+  %1 = load atomic i32, i32* %p monotonic, align 4
+  %2 = xor i32 %1, %v
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
 ; ----- INC -----
 
 define void @inc_8(i8* %p) {
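
For reviewers, here is a minimal C++ sketch (not part of the patch; the function and variable names are illustrative) of the source-level pattern that the new register-operand pseudos target, mirroring the 32-bit *_32r tests above. With this change, x86-64 codegen for such a function is expected to be a single memory-destination instruction (roughly 'addl %esi, (%rdi)') with no 'lock' prefix and no separate load/store through a register; the acquire load and release store need no extra fencing on x86 because of its strong memory ordering.

#include <atomic>

// Illustrative only: load-acquire, register op, store-release on the same
// location, i.e. the "x.store (register op x.load(acquire), release)" shape
// described in the X86InstrCompiler.td comment above.
void add_release(std::atomic<int> *p, int v) {
  int t = p->load(std::memory_order_acquire);  // lowered to atomic_load_32 acquire
  p->store(t + v, std::memory_order_release);  // lowered to atomic_store_32 release
}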