Diff 130852

llvm/trunk/lib/Target/X86/X86.td

Show First 20 Lines • Show All 262 Lines • ▼ Show 20 Lines	def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;		"LEA instruction with certain arguments is slow">;
def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",		def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
"LEA instruction with 3 ops or certain registers is slow">;		"LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",		def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;		"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat		def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",		: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;		"Use software floating point features.">;
		def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
		"HasPOPCNTFalseDeps", "true",
		"POPCNT has a false dependency on dest register">;
		def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
		"HasLZCNTFalseDeps", "true",
		"LZCNT/TZCNT have a false dependency on dest register">;
// On recent X86 (port bound) processors, it's preferable to combine to a single shuffle		// On recent X86 (port bound) processors, it's preferable to combine to a single shuffle
// using a variable mask over multiple fixed shuffles.		// using a variable mask over multiple fixed shuffles.
def FeatureFastVariableShuffle		def FeatureFastVariableShuffle
: SubtargetFeature<"fast-variable-shuffle",		: SubtargetFeature<"fast-variable-shuffle",
"HasFastVariableShuffle",		"HasFastVariableShuffle",
"true", "Shuffles with variable masks are fast">;		"true", "Shuffles with variable masks are fast">;
// On some X86 processors, there is no performance hazard to writing only the		// On some X86 processors, there is no performance hazard to writing only the
// lower parts of a YMM or ZMM register without clearing the upper part.		// lower parts of a YMM or ZMM register without clearing the upper part.
▲ Show 20 Lines • Show All 335 Lines • ▼ Show 20 Lines	def SNBFeatures : ProcessorFeatures<[], [
FeatureFastScalarFSQRT,		FeatureFastScalarFSQRT,
FeatureFastSHLDRotate,		FeatureFastSHLDRotate,
FeatureSlowIncDec,		FeatureSlowIncDec,
FeatureMacroFusion		FeatureMacroFusion
]>;		]>;

class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,		class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
SNBFeatures.Value, [		SNBFeatures.Value, [
FeatureSlowUAMem32		FeatureSlowUAMem32,
		FeaturePOPCNTFalseDeps
]>;		]>;
def : SandyBridgeProc<"sandybridge">;		def : SandyBridgeProc<"sandybridge">;
def : SandyBridgeProc<"corei7-avx">; // Legacy alias.		def : SandyBridgeProc<"corei7-avx">; // Legacy alias.

def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [		def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [
FeatureRDRAND,		FeatureRDRAND,
FeatureF16C,		FeatureF16C,
FeatureFSGSBase		FeatureFSGSBase
]>;		]>;

class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,		class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
IVBFeatures.Value, [		IVBFeatures.Value, [
FeatureSlowUAMem32		FeatureSlowUAMem32,
		FeaturePOPCNTFalseDeps
]>;		]>;
def : IvyBridgeProc<"ivybridge">;		def : IvyBridgeProc<"ivybridge">;
def : IvyBridgeProc<"core-avx-i">; // Legacy alias.		def : IvyBridgeProc<"core-avx-i">; // Legacy alias.

def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [		def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureAVX2,		FeatureAVX2,
FeatureBMI,		FeatureBMI,
FeatureBMI2,		FeatureBMI2,
FeatureERMSB,		FeatureERMSB,
FeatureFMA,		FeatureFMA,
FeatureLZCNT,		FeatureLZCNT,
FeatureMOVBE,		FeatureMOVBE,
FeatureFastVariableShuffle		FeatureFastVariableShuffle
]>;		]>;

class HaswellProc<string Name> : ProcModel<Name, HaswellModel,		class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
HSWFeatures.Value, [		HSWFeatures.Value, [
ProcIntelHSW		ProcIntelHSW,
		FeaturePOPCNTFalseDeps,
		FeatureLZCNTFalseDeps
]>;		]>;
def : HaswellProc<"haswell">;		def : HaswellProc<"haswell">;
def : HaswellProc<"core-avx2">; // Legacy alias.		def : HaswellProc<"core-avx2">; // Legacy alias.

def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [		def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
FeatureADX,		FeatureADX,
FeatureRDSEED,		FeatureRDSEED,
FeaturePRFCHW		FeaturePRFCHW
]>;		]>;
class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,		class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
BDWFeatures.Value, [		BDWFeatures.Value, [
ProcIntelBDW		ProcIntelBDW,
		FeaturePOPCNTFalseDeps,
		FeatureLZCNTFalseDeps
]>;		]>;
def : BroadwellProc<"broadwell">;		def : BroadwellProc<"broadwell">;

def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [		def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
FeatureMPX,		FeatureMPX,
FeatureRTM,		FeatureRTM,
FeatureXSAVEC,		FeatureXSAVEC,
FeatureXSAVES,		FeatureXSAVES,
FeatureSGX,		FeatureSGX,
FeatureCLFLUSHOPT,		FeatureCLFLUSHOPT,
FeatureFastVectorFSQRT		FeatureFastVectorFSQRT
]>;		]>;

class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,		class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
SKLFeatures.Value, [		SKLFeatures.Value, [
ProcIntelSKL,		ProcIntelSKL,
FeatureHasFastGather		FeatureHasFastGather,
		FeaturePOPCNTFalseDeps
]>;		]>;
def : SkylakeClientProc<"skylake">;		def : SkylakeClientProc<"skylake">;

def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [		def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureAVX512,		FeatureAVX512,
FeatureERI,		FeatureERI,
FeatureCDI,		FeatureCDI,
FeaturePFI,		FeaturePFI,
▲ Show 20 Lines • Show All 386 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86InstrInfo.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 8,055 Lines • ▼ Show 20 Lines
/// movss (%rdi), %xmm0		/// movss (%rdi), %xmm0
/// cvtss2sd %xmm0, %xmm0		/// cvtss2sd %xmm0, %xmm0
///		///
/// Instead of		/// Instead of
/// cvtss2sd (%rdi), %xmm0		/// cvtss2sd (%rdi), %xmm0
///		///
/// FIXME: This should be turned into a TSFlags.		/// FIXME: This should be turned into a TSFlags.
///		///
static bool hasPartialRegUpdate(unsigned Opcode) {		static bool hasPartialRegUpdate(unsigned Opcode,
		const X86Subtarget &Subtarget) {
switch (Opcode) {		switch (Opcode) {
case X86::CVTSI2SSrr:		case X86::CVTSI2SSrr:
case X86::CVTSI2SSrm:		case X86::CVTSI2SSrm:
case X86::CVTSI642SSrr:		case X86::CVTSI642SSrr:
case X86::CVTSI642SSrm:		case X86::CVTSI642SSrm:
case X86::CVTSI2SDrr:		case X86::CVTSI2SDrr:
case X86::CVTSI2SDrm:		case X86::CVTSI2SDrm:
case X86::CVTSI642SDrr:		case X86::CVTSI642SDrr:
Show All 22 Lines	static bool hasPartialRegUpdate(unsigned Opcode,
case X86::SQRTSSm:		case X86::SQRTSSm:
case X86::SQRTSSr_Int:		case X86::SQRTSSr_Int:
case X86::SQRTSSm_Int:		case X86::SQRTSSm_Int:
case X86::SQRTSDr:		case X86::SQRTSDr:
case X86::SQRTSDm:		case X86::SQRTSDm:
case X86::SQRTSDr_Int:		case X86::SQRTSDr_Int:
case X86::SQRTSDm_Int:		case X86::SQRTSDm_Int:
return true;		return true;
		// GPR
		case X86::POPCNT32rm:
		case X86::POPCNT32rr:
		case X86::POPCNT64rm:
		case X86::POPCNT64rr:
		return Subtarget.hasPOPCNTFalseDeps();
		case X86::LZCNT32rm:
		case X86::LZCNT32rr:
		case X86::LZCNT64rm:
		case X86::LZCNT64rr:
		case X86::TZCNT32rm:
		case X86::TZCNT32rr:
		case X86::TZCNT64rm:
		case X86::TZCNT64rr:
		return Subtarget.hasLZCNTFalseDeps();
}		}

return false;		return false;
}		}

/// Inform the BreakFalseDeps pass how many idle		/// Inform the BreakFalseDeps pass how many idle
/// instructions we would like before a partial register update.		/// instructions we would like before a partial register update.
unsigned X86InstrInfo::getPartialRegUpdateClearance(		unsigned X86InstrInfo::getPartialRegUpdateClearance(
const MachineInstr &MI, unsigned OpNum,		const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {		const TargetRegisterInfo *TRI) const {
if (OpNum != 0 \|\| !hasPartialRegUpdate(MI.getOpcode()))		if (OpNum != 0 \|\| !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
return 0;		return 0;

// If MI is marked as reading Reg, the partial register update is wanted.		// If MI is marked as reading Reg, the partial register update is wanted.
const MachineOperand &MO = MI.getOperand(0);		const MachineOperand &MO = MI.getOperand(0);
unsigned Reg = MO.getReg();		unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg)) {		if (TargetRegisterInfo::isVirtualRegister(Reg)) {
if (MO.readsReg() \|\| MI.readsVirtualRegister(Reg))		if (MO.readsReg() \|\| MI.readsVirtualRegister(Reg))
return 0;		return 0;
▲ Show 20 Lines • Show All 189 Lines • ▼ Show 20 Lines	if (X86::VR128RegClass.contains(Reg)) {
// Use vxorps to clear the full ymm register.		// Use vxorps to clear the full ymm register.
// It wants to read and write the xmm sub-register.		// It wants to read and write the xmm sub-register.
unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);		unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)		BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
.addReg(XReg, RegState::Undef)		.addReg(XReg, RegState::Undef)
.addReg(XReg, RegState::Undef)		.addReg(XReg, RegState::Undef)
.addReg(Reg, RegState::ImplicitDefine);		.addReg(Reg, RegState::ImplicitDefine);
MI.addRegisterKilled(Reg, TRI, true);		MI.addRegisterKilled(Reg, TRI, true);
		} else if (X86::GR64RegClass.contains(Reg)) {
		// Use XOR32rr because it has a shorter encoding and also zeroes the upper
		// bits of the 64-bit register.
		unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit);
		BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
		.addReg(XReg, RegState::Undef)
		.addReg(XReg, RegState::Undef)
		.addReg(Reg, RegState::ImplicitDefine);
		MI.addRegisterKilled(Reg, TRI, true);
		} else if (X86::GR32RegClass.contains(Reg)) {
		BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
		.addReg(Reg, RegState::Undef)
		.addReg(Reg, RegState::Undef);
		MI.addRegisterKilled(Reg, TRI, true);
}		}
}		}

static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,		static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
int PtrOffset = 0) {		int PtrOffset = 0) {
unsigned NumAddrOps = MOs.size();		unsigned NumAddrOps = MOs.size();

if (NumAddrOps < 4) {		if (NumAddrOps < 4) {
▲ Show 20 Lines • Show All 155 Lines • ▼ Show 20 Lines	MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (isSlowTwoMemOps && !MF.getFunction().optForMinSize() &&		if (isSlowTwoMemOps && !MF.getFunction().optForMinSize() &&
(MI.getOpcode() == X86::CALL32r \|\| MI.getOpcode() == X86::CALL64r \|\|		(MI.getOpcode() == X86::CALL32r \|\| MI.getOpcode() == X86::CALL64r \|\|
MI.getOpcode() == X86::PUSH16r \|\| MI.getOpcode() == X86::PUSH32r \|\|		MI.getOpcode() == X86::PUSH16r \|\| MI.getOpcode() == X86::PUSH32r \|\|
MI.getOpcode() == X86::PUSH64r))		MI.getOpcode() == X86::PUSH64r))
return nullptr;		return nullptr;

// Avoid partial register update stalls unless optimizing for size.		// Avoid partial register update stalls unless optimizing for size.
// TODO: we should block undef reg update as well.		// TODO: we should block undef reg update as well.
if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))		if (!MF.getFunction().optForSize() &&
		hasPartialRegUpdate(MI.getOpcode(), Subtarget))
return nullptr;		return nullptr;

unsigned NumOps = MI.getDesc().getNumOperands();		unsigned NumOps = MI.getDesc().getNumOperands();
bool isTwoAddr =		bool isTwoAddr =
NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;		NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;

// FIXME: AsmPrinter doesn't know how to handle		// FIXME: AsmPrinter doesn't know how to handle
// X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.		// X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
▲ Show 20 Lines • Show All 152 Lines • ▼ Show 20 Lines	X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
int FrameIndex, LiveIntervals *LIS) const {		int FrameIndex, LiveIntervals *LIS) const {
// Check switch flag		// Check switch flag
if (NoFusing)		if (NoFusing)
return nullptr;		return nullptr;

// Unless optimizing for size, don't fold to avoid partial		// Unless optimizing for size, don't fold to avoid partial
// register update stalls		// register update stalls
// TODO: we should block undef reg update as well.		// TODO: we should block undef reg update as well.
if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))		if (!MF.getFunction().optForSize() &&
		hasPartialRegUpdate(MI.getOpcode(), Subtarget))
return nullptr;		return nullptr;

// Don't fold subreg spills, or reloads that use a high subreg.		// Don't fold subreg spills, or reloads that use a high subreg.
for (auto Op : Ops) {		for (auto Op : Ops) {
MachineOperand &MO = MI.getOperand(Op);		MachineOperand &MO = MI.getOperand(Op);
auto SubReg = MO.getSubReg();		auto SubReg = MO.getSubReg();
if (SubReg && (MO.isDef() \|\| SubReg == X86::sub_8bit_hi))		if (SubReg && (MO.isDef() \|\| SubReg == X86::sub_8bit_hi))
return nullptr;		return nullptr;
▲ Show 20 Lines • Show All 182 Lines • ▼ Show 20 Lines	if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);		return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
}		}

// Check switch flag		// Check switch flag
if (NoFusing) return nullptr;		if (NoFusing) return nullptr;

// Avoid partial register update stalls unless optimizing for size.		// Avoid partial register update stalls unless optimizing for size.
// TODO: we should block undef reg update as well.		// TODO: we should block undef reg update as well.
if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))		if (!MF.getFunction().optForSize() &&
		hasPartialRegUpdate(MI.getOpcode(), Subtarget))
return nullptr;		return nullptr;

// Determine the alignment of the load.		// Determine the alignment of the load.
unsigned Alignment = 0;		unsigned Alignment = 0;
if (LoadMI.hasOneMemOperand())		if (LoadMI.hasOneMemOperand())
Alignment = (*LoadMI.memoperands_begin())->getAlignment();		Alignment = (*LoadMI.memoperands_begin())->getAlignment();
else		else
switch (LoadMI.getOpcode()) {		switch (LoadMI.getOpcode()) {
▲ Show 20 Lines • Show All 2,357 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86Subtarget.h

Show First 20 Lines • Show All 226 Lines • ▼ Show 20 Lines	protected:
/// True if this processor has the CMPXCHG16B instruction;		/// True if this processor has the CMPXCHG16B instruction;
/// this is true for most x86-64 chips, but not the first AMD chips.		/// this is true for most x86-64 chips, but not the first AMD chips.
bool HasCmpxchg16b;		bool HasCmpxchg16b;

/// True if the LEA instruction should be used for adjusting		/// True if the LEA instruction should be used for adjusting
/// the stack pointer. This is an optimization for Intel Atom processors.		/// the stack pointer. This is an optimization for Intel Atom processors.
bool UseLeaForSP;		bool UseLeaForSP;

		/// True if POPCNT instruction has a false dependency on the destination register.
		bool HasPOPCNTFalseDeps;

		/// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
		bool HasLZCNTFalseDeps;

/// True if it's preferable to combine to a single shuffle using a variable		/// True if it's preferable to combine to a single shuffle using a variable
/// mask over multiple fixed shuffles.		/// mask over multiple fixed shuffles.
bool HasFastVariableShuffle;		bool HasFastVariableShuffle;

/// True if there is no performance penalty to writing only the lower parts		/// True if there is no performance penalty to writing only the lower parts
/// of a YMM or ZMM register without clearing the upper part.		/// of a YMM or ZMM register without clearing the upper part.
bool HasFastPartialYMMorZMMWrite;		bool HasFastPartialYMMorZMMWrite;

▲ Show 20 Lines • Show All 309 Lines • ▼ Show 20 Lines	public:
bool isPMULLDSlow() const { return IsPMULLDSlow; }		bool isPMULLDSlow() const { return IsPMULLDSlow; }
bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }		bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }		bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
int getGatherOverhead() const { return GatherOverhead; }		int getGatherOverhead() const { return GatherOverhead; }
int getScatterOverhead() const { return ScatterOverhead; }		int getScatterOverhead() const { return ScatterOverhead; }
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }		bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }		bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }		bool useLeaForSP() const { return UseLeaForSP; }
		bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
		bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
bool hasFastVariableShuffle() const {		bool hasFastVariableShuffle() const {
return HasFastVariableShuffle;		return HasFastVariableShuffle;
}		}
bool hasFastPartialYMMorZMMWrite() const {		bool hasFastPartialYMMorZMMWrite() const {
return HasFastPartialYMMorZMMWrite;		return HasFastPartialYMMorZMMWrite;
}		}
bool hasFastGather() const { return HasFastGather; }		bool hasFastGather() const { return HasFastGather; }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }		bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
▲ Show 20 Lines • Show All 187 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86Subtarget.cpp

Show First 20 Lines • Show All 323 Lines • ▼ Show 20 Lines	void X86Subtarget::initializeEnvironment() {
HasRDPID = false;		HasRDPID = false;
IsPMULLDSlow = false;		IsPMULLDSlow = false;
IsSHLDSlow = false;		IsSHLDSlow = false;
IsUAMem16Slow = false;		IsUAMem16Slow = false;
IsUAMem32Slow = false;		IsUAMem32Slow = false;
HasSSEUnalignedMem = false;		HasSSEUnalignedMem = false;
HasCmpxchg16b = false;		HasCmpxchg16b = false;
UseLeaForSP = false;		UseLeaForSP = false;
		HasPOPCNTFalseDeps = false;
		HasLZCNTFalseDeps = false;
HasFastVariableShuffle = false;		HasFastVariableShuffle = false;
HasFastPartialYMMorZMMWrite = false;		HasFastPartialYMMorZMMWrite = false;
HasFastGather = false;		HasFastGather = false;
HasFastScalarFSQRT = false;		HasFastScalarFSQRT = false;
HasFastVectorFSQRT = false;		HasFastVectorFSQRT = false;
HasFastLZCNT = false;		HasFastLZCNT = false;
HasFastSHLDRotate = false;		HasFastSHLDRotate = false;
HasMacroFusion = false;		HasMacroFusion = false;
▲ Show 20 Lines • Show All 81 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/bitcnt-false-dep.ll

				; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell -mattr=+lzcnt \| FileCheck %s --check-prefix=HSW
				; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -mattr=+lzcnt \| FileCheck %s --check-prefix=SKL

				; This tests a fix for bugzilla 33869 https://bugs.llvm.org/show_bug.cgi?id=33869

				declare i32 @llvm.ctpop.i32(i32)
				declare i64 @llvm.ctpop.i64(i64)
				declare i64 @llvm.ctlz.i64(i64, i1)
				declare i32 @llvm.cttz.i32(i32, i1)
				declare i64 @llvm.cttz.i64(i64, i1)
				declare i32 @llvm.ctlz.i32(i32, i1)

				define i32 @loopdep_popcnt32(i32* nocapture %x, double* nocapture %y) nounwind {
				entry:
				%vx = load i32, i32* %x
				br label %loop
				loop:
				%i = phi i32 [ 1, %entry ], [ %inc, %loop ]
				%s1 = phi i32 [ %vx, %entry ], [ %s2, %loop ]
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%j = tail call i32 @llvm.ctpop.i32(i32 %i)
				%s2 = add i32 %s1, %j
				%inc = add nsw i32 %i, 1
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%exitcond = icmp eq i32 %inc, 156250000
				br i1 %exitcond, label %ret, label %loop
				ret:
				ret i32 %s2

				;HSW-LABEL:@loopdep_popcnt32
				;HSW: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
				;HSW-NEXT: popcntl {{.*}}, [[GPR0]]

				;SKL-LABEL:@loopdep_popcnt32
				;SKL: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
				;SKL-NEXT: popcntl {{.*}}, [[GPR0]]
				}

				define i64 @loopdep_popcnt64(i64* nocapture %x, double* nocapture %y) nounwind {
				entry:
				%vx = load i64, i64* %x
				br label %loop
				loop:
				%i = phi i64 [ 1, %entry ], [ %inc, %loop ]
				%s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%j = tail call i64 @llvm.ctpop.i64(i64 %i)
				%s2 = add i64 %s1, %j
				%inc = add nsw i64 %i, 1
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%exitcond = icmp eq i64 %inc, 156250000
				br i1 %exitcond, label %ret, label %loop
				ret:
				ret i64 %s2

				;HSW-LABEL:@loopdep_popcnt64
				;HSW: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
				;HSW-NEXT: popcntq {{.*}}, %r[[GPR0]]

				;SKL-LABEL:@loopdep_popcnt64
				;SKL: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
				;SKL-NEXT: popcntq {{.*}}, %r[[GPR0]]
				}

				define i32 @loopdep_tzct32(i32* nocapture %x, double* nocapture %y) nounwind {
				entry:
				%vx = load i32, i32* %x
				br label %loop
				loop:
				%i = phi i32 [ 1, %entry ], [ %inc, %loop ]
				%s1 = phi i32 [ %vx, %entry ], [ %s2, %loop ]
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%j = call i32 @llvm.cttz.i32(i32 %i, i1 true)
				%s2 = add i32 %s1, %j
				%inc = add nsw i32 %i, 1
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%exitcond = icmp eq i32 %inc, 156250000
				br i1 %exitcond, label %ret, label %loop
				ret:
				ret i32 %s2

				;HSW-LABEL:@loopdep_tzct32
				;HSW: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
				;HSW-NEXT: tzcntl {{.*}}, [[GPR0]]

				; This false dependency issue was fixed in Skylake
				;SKL-LABEL:@loopdep_tzct32
				;SKL-NOT: xor
				;SKL: tzcntl
				}

				define i64 @loopdep_tzct64(i64* nocapture %x, double* nocapture %y) nounwind {
				entry:
				%vx = load i64, i64* %x
				br label %loop
				loop:
				%i = phi i64 [ 1, %entry ], [ %inc, %loop ]
				%s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%j = tail call i64 @llvm.cttz.i64(i64 %i, i1 true)
				%s2 = add i64 %s1, %j
				%inc = add nsw i64 %i, 1
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%exitcond = icmp eq i64 %inc, 156250000
				br i1 %exitcond, label %ret, label %loop
				ret:
				ret i64 %s2

				;HSW-LABEL:@loopdep_tzct64
				;HSW: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
				;HSW-NEXT: tzcntq {{.*}}, %r[[GPR0]]

				; This false dependency issue was fixed in Skylake
				;SKL-LABEL:@loopdep_tzct64
				;SKL-NOT: xor
				;SKL: tzcntq
				}

				define i32 @loopdep_lzct32(i32* nocapture %x, double* nocapture %y) nounwind {
				entry:
				%vx = load i32, i32* %x
				br label %loop
				loop:
				%i = phi i32 [ 1, %entry ], [ %inc, %loop ]
				%s1 = phi i32 [ %vx, %entry ], [ %s2, %loop ]
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%j = call i32 @llvm.ctlz.i32(i32 %i, i1 true)
				%s2 = add i32 %s1, %j
				%inc = add nsw i32 %i, 1
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%exitcond = icmp eq i32 %inc, 156250000
				br i1 %exitcond, label %ret, label %loop
				ret:
				ret i32 %s2

				;HSW-LABEL:@loopdep_lzct32
				;HSW: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
				;HSW-NEXT: lzcntl {{.*}}, [[GPR0]]

				; This false dependency issue was fixed in Skylake
				;SKL-LABEL:@loopdep_lzct32
				;SKL-NOT: xor
				;SKL: lzcntl
				}

				define i64 @loopdep_lzct64(i64* nocapture %x, double* nocapture %y) nounwind {
				entry:
				%vx = load i64, i64* %x
				br label %loop
				loop:
				%i = phi i64 [ 1, %entry ], [ %inc, %loop ]
				%s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%j = tail call i64 @llvm.ctlz.i64(i64 %i, i1 true)
				%s2 = add i64 %s1, %j
				%inc = add nsw i64 %i, 1
				tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
				%exitcond = icmp eq i64 %inc, 156250000
				br i1 %exitcond, label %ret, label %loop
				ret:
				ret i64 %s2

				;HSW-LABEL:@loopdep_lzct64
				;HSW: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
				;HSW-NEXT: lzcntq {{.*}}, %r[[GPR0]]

				; This false dependency issue was fixed in Skylake
				;SKL-LABEL:@loopdep_lzct64
				;SKL-NOT: xor
				;SKL: lzcntq
				}

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Break false dependencies for POPCNT, LZCNT, TZCNT
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 130852

llvm/trunk/lib/Target/X86/X86.td

llvm/trunk/lib/Target/X86/X86InstrInfo.cpp

llvm/trunk/lib/Target/X86/X86Subtarget.h

llvm/trunk/lib/Target/X86/X86Subtarget.cpp

llvm/trunk/test/CodeGen/X86/bitcnt-false-dep.ll

This is an archive of the discontinued LLVM Phabricator instance.

[X86] Break false dependencies for POPCNT, LZCNT, TZCNT
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 130852

llvm/trunk/lib/Target/X86/X86.td

llvm/trunk/lib/Target/X86/X86InstrInfo.cpp

llvm/trunk/lib/Target/X86/X86Subtarget.h

llvm/trunk/lib/Target/X86/X86Subtarget.cpp

llvm/trunk/test/CodeGen/X86/bitcnt-false-dep.ll

[X86] Break false dependencies for POPCNT, LZCNT, TZCNT
ClosedPublic