Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -241,6 +241,12 @@
 def FeatureSoftFloat
     : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                        "Use software floating point features.">;
+def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
+                                     "HasPOPCNTFalseDeps", "true",
+                                     "POPCNT has a false dependency on dest register">;
+def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
+                                     "HasLZCNTFalseDeps", "true",
+                                     "LZCNT/TZCNT have a false dependency on dest register">;
 // On some X86 processors, there is no performance hazard to writing only the
 // lower parts of a YMM or ZMM register without clearing the upper part.
 def FeatureFastPartialYMMorZMMWrite
@@ -539,7 +545,8 @@
 class SandyBridgeProc : ProcModel;
 def : SandyBridgeProc<"sandybridge">;
 def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
@@ -552,7 +559,8 @@
 class IvyBridgeProc : ProcModel;
 def : IvyBridgeProc<"ivybridge">;
 def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
@@ -569,7 +577,9 @@
 class HaswellProc : ProcModel;
 def : HaswellProc<"haswell">;
 def : HaswellProc<"core-avx2">; // Legacy alias.
@@ -580,7 +590,9 @@
 ]>;
 class BroadwellProc : ProcModel;
 def : BroadwellProc<"broadwell">;
@@ -596,7 +608,8 @@
 class SkylakeClientProc : ProcModel;
 def : SkylakeClientProc<"skylake">;
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -7969,7 +7969,8 @@
 ///
 /// FIXME: This should be turned into a TSFlags.
 ///
-static bool hasPartialRegUpdate(unsigned Opcode) {
+static bool hasPartialRegUpdate(unsigned Opcode,
+                                const X86Subtarget &Subtarget) {
   switch (Opcode) {
   case X86::CVTSI2SSrr:
   case X86::CVTSI2SSrm:
@@ -8008,6 +8009,21 @@
   case X86::SQRTSDr_Int:
   case X86::SQRTSDm_Int:
     return true;
+  // GPR
+  case X86::POPCNT32rm:
+  case X86::POPCNT32rr:
+  case X86::POPCNT64rm:
+  case X86::POPCNT64rr:
+    return Subtarget.hasPOPCNTFalseDeps();
+  case X86::LZCNT32rm:
+  case X86::LZCNT32rr:
+  case X86::LZCNT64rm:
+  case X86::LZCNT64rr:
+  case X86::TZCNT32rm:
+  case X86::TZCNT32rr:
+  case X86::TZCNT64rm:
+  case X86::TZCNT64rr:
+    return Subtarget.hasLZCNTFalseDeps();
   }

   return false;
@@ -8018,7 +8034,7 @@
 unsigned X86InstrInfo::getPartialRegUpdateClearance(
     const MachineInstr &MI, unsigned OpNum,
     const TargetRegisterInfo *TRI) const {
-  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
+  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
     return 0;

   // If MI is marked as reading Reg, the partial register update is wanted.
@@ -8220,6 +8236,20 @@
         .addReg(XReg, RegState::Undef)
         .addReg(Reg, RegState::ImplicitDefine);
     MI.addRegisterKilled(Reg, TRI, true);
+  } else if (X86::GR64RegClass.contains(Reg)) {
+    // Using XOR32rr because it has a shorter encoding and implicitly zeros
+    // the upper 32 bits as well.
+    unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit);
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
+        .addReg(XReg, RegState::Undef)
+        .addReg(XReg, RegState::Undef)
+        .addReg(Reg, RegState::ImplicitDefine);
+    MI.addRegisterKilled(Reg, TRI, true);
+  } else if (X86::GR32RegClass.contains(Reg)) {
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
+    MI.addRegisterKilled(Reg, TRI, true);
   }
 }

@@ -8391,7 +8421,8 @@

   // Avoid partial register update stalls unless optimizing for size.
   // TODO: we should block undef reg update as well.
-  if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+  if (!MF.getFunction()->optForSize() &&
+      hasPartialRegUpdate(MI.getOpcode(), Subtarget))
     return nullptr;

   unsigned NumOps = MI.getDesc().getNumOperands();
@@ -8560,7 +8591,8 @@
   // Unless optimizing for size, don't fold to avoid partial
   // register update stalls
   // TODO: we should block undef reg update as well.
-  if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+  if (!MF.getFunction()->optForSize() &&
+      hasPartialRegUpdate(MI.getOpcode(), Subtarget))
     return nullptr;

   // Don't fold subreg spills, or reloads that use a high subreg.
@@ -8759,7 +8791,8 @@

   // Avoid partial register update stalls unless optimizing for size.
   // TODO: we should block undef reg update as well.
-  if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+  if (!MF.getFunction()->optForSize() &&
+      hasPartialRegUpdate(MI.getOpcode(), Subtarget))
     return nullptr;

   // Determine the alignment of the load.
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -218,6 +218,12 @@
   /// the stack pointer. This is an optimization for Intel Atom processors.
   bool UseLeaForSP;

+  /// True if the POPCNT instruction has a false dependency on the destination register.
+  bool HasPOPCNTFalseDeps;
+
+  /// True if the LZCNT/TZCNT instructions have a false dependency on the destination register.
+  bool HasLZCNTFalseDeps;
+
   /// True if there is no performance penalty to writing only the lower parts
   /// of a YMM or ZMM register without clearing the upper part.
   bool HasFastPartialYMMorZMMWrite;
@@ -495,6 +501,8 @@
   bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
+  bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
+  bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
   bool hasFastPartialYMMorZMMWrite() const {
     return HasFastPartialYMMorZMMWrite;
   }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -343,6 +343,8 @@
   HasSSEUnalignedMem = false;
   HasCmpxchg16b = false;
   UseLeaForSP = false;
+  HasPOPCNTFalseDeps = false;
+  HasLZCNTFalseDeps = false;
   HasFastPartialYMMorZMMWrite = false;
   HasFastScalarFSQRT = false;
   HasFastVectorFSQRT = false;
Index: test/CodeGen/X86/bitcnt-false-dep.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/bitcnt-false-dep.ll
@@ -0,0 +1,171 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell -mattr=+lzcnt | FileCheck %s --check-prefix=HSW
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -mattr=+lzcnt | FileCheck %s --check-prefix=SKL
+
+; This tests a fix for bugzilla 33869: https://bugs.llvm.org/show_bug.cgi?id=33869
+
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+define i32 @loopdep_popcnt32(i32* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i32, i32* %x
+  br label %loop
+loop:
+  %i = phi i32 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i32 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = tail call i32 @llvm.ctpop.i32(i32 %i)
+  %s2 = add i32 %s1, %j
+  %inc = add nsw i32 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i32 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i32 %s2
+
+;HSW-LABEL:@loopdep_popcnt32
+;HSW: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
+;HSW-NEXT: popcntl {{.*}}, [[GPR0]]
+
+;SKL-LABEL:@loopdep_popcnt32
+;SKL: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
+;SKL-NEXT: popcntl {{.*}}, [[GPR0]]
+}
+
+define i64 @loopdep_popcnt64(i64* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i64, i64* %x
+  br label %loop
+loop:
+  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = tail call i64 @llvm.ctpop.i64(i64 %i)
+  %s2 = add i64 %s1, %j
+  %inc = add nsw i64 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i64 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i64 %s2
+
+;HSW-LABEL:@loopdep_popcnt64
+;HSW: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
+;HSW-NEXT: popcntq {{.*}}, %r[[GPR0]]
+
+;SKL-LABEL:@loopdep_popcnt64
+;SKL: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
+;SKL-NEXT: popcntq {{.*}}, %r[[GPR0]]
+}
+
+define i32 @loopdep_tzct32(i32* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i32, i32* %x
+  br label %loop
+loop:
+  %i = phi i32 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i32 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = call i32 @llvm.cttz.i32(i32 %i, i1 true)
+  %s2 = add i32 %s1, %j
+  %inc = add nsw i32 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i32 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i32 %s2
+
+;HSW-LABEL:@loopdep_tzct32
+;HSW: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
+;HSW-NEXT: tzcntl {{.*}}, [[GPR0]]
+
+; This false dependency issue was fixed in Skylake
+;SKL-LABEL:@loopdep_tzct32
+;SKL-NOT: xor
+;SKL: tzcntl
+}
+
+define i64 @loopdep_tzct64(i64* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i64, i64* %x
+  br label %loop
+loop:
+  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = tail call i64 @llvm.cttz.i64(i64 %i, i1 true)
+  %s2 = add i64 %s1, %j
+  %inc = add nsw i64 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i64 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i64 %s2
+
+;HSW-LABEL:@loopdep_tzct64
+;HSW: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
+;HSW-NEXT: tzcntq {{.*}}, %r[[GPR0]]
+
+; This false dependency issue was fixed in Skylake
+;SKL-LABEL:@loopdep_tzct64
+;SKL-NOT: xor
+;SKL: tzcntq
+}
+
+define i32 @loopdep_lzct32(i32* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i32, i32* %x
+  br label %loop
+loop:
+  %i = phi i32 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i32 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = call i32 @llvm.ctlz.i32(i32 %i, i1 true)
+  %s2 = add i32 %s1, %j
+  %inc = add nsw i32 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i32 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i32 %s2
+
+;HSW-LABEL:@loopdep_lzct32
+;HSW: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
+;HSW-NEXT: lzcntl {{.*}}, [[GPR0]]
+
+; This false dependency issue was fixed in Skylake
+;SKL-LABEL:@loopdep_lzct32
+;SKL-NOT: xor
+;SKL: lzcntl
+}
+
+define i64 @loopdep_lzct64(i64* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i64, i64* %x
+  br label %loop
+loop:
+  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = tail call i64 @llvm.ctlz.i64(i64 %i, i1 true)
+  %s2 = add i64 %s1, %j
+  %inc = add nsw i64 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i64 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i64 %s2
+
+;HSW-LABEL:@loopdep_lzct64
+;HSW: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
+;HSW-NEXT: lzcntq {{.*}}, %r[[GPR0]]
+
+; This false dependency issue was fixed in Skylake
+;SKL-LABEL:@loopdep_lzct64
+;SKL-NOT: xor
+;SKL: lzcntq
+}
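
Note on the intended codegen effect (illustrative sketch, not part of the patch): the FileCheck lines above capture what the change is meant to produce when the dependency-fixing pass calls the new breakPartialRegDependency() code on a CPU with one of the false-deps features. Assuming the count result lands in %rax (the test only constrains the register through the [[GPR0]] captures), the 64-bit popcnt loop body changes roughly like this:

    # before: popcnt carries a false dependency on the stale value of %rax
    popcntq %rdi, %rax

    # after: a 32-bit xor breaks the dependency and zeros all 64 bits of %rax
    xorl    %eax, %eax
    popcntq %rdi, %rax

On Skylake only the POPCNT case keeps the xor; the SKL-NOT lines check that no dependency-breaking xor is emitted for LZCNT/TZCNT there.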