Index: ../llvm/lib/Target/X86/CMakeLists.txt =================================================================== --- ../llvm/lib/Target/X86/CMakeLists.txt +++ ../llvm/lib/Target/X86/CMakeLists.txt @@ -25,6 +25,7 @@ X86MCInstLower.cpp X86MachineFunctionInfo.cpp X86PadShortFunction.cpp + X86PopcntOpt.cpp X86RegisterInfo.cpp X86SelectionDAGInfo.cpp X86ShuffleDecodeConstantPool.cpp Index: ../llvm/lib/Target/X86/X86.h =================================================================== --- ../llvm/lib/Target/X86/X86.h +++ ../llvm/lib/Target/X86/X86.h @@ -45,6 +45,10 @@ /// transition penalty between functions encoded with AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); +/// Return a pass that inserts an xor before popcnt to remove +/// the false dependency on the popcnt destination register. +FunctionPass *createX86PopcntOptPass(); + /// Return a pass that pads short functions with NOOPs. /// This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); Index: ../llvm/lib/Target/X86/X86PopcntOpt.cpp =================================================================== --- ../llvm/lib/Target/X86/X86PopcntOpt.cpp +++ ../llvm/lib/Target/X86/X86PopcntOpt.cpp @@ -0,0 +1,93 @@ +//===-- X86PopcntOpt.cpp - ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which inserts x86 xor instructions +// before popcnt instructions. +// Sandy/Ivy Bridge and Haswell processors have a false dependency in the +// popcnt instruction on its destination register. +// The workaround is to insert an xor before the popcnt to break the dependency. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-popcnt-opt"
+
+STATISTIC(NumXOR, "Number of XOR instructions inserted");
+
+namespace {
+/// Machine-function pass that places an xor of the destination register in
+/// front of popcnt instructions (see insertXor for when the xor is omitted).
+class PopcntOptInserter : public MachineFunctionPass {
+public:
+  PopcntOptInserter() : MachineFunctionPass(ID) {}
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  const char *getPassName() const override { return "X86 Popcnt optimization"; }
+
+private:
+  /// Insert the xor opcode \p Xor on the destination register of the popcnt
+  /// at \p I, immediately before it in \p MBB, unless the popcnt itself reads
+  /// that register.
+  void insertXor(MachineBasicBlock::iterator I, MachineBasicBlock &MBB,
+                 unsigned Xor);
+  /// Set once any xor has been inserted.
+  bool EverMadeChange = false;
+  /// Instruction info used to build the xor; assigned in runOnMachineFunction.
+  const TargetInstrInfo *TII = nullptr;
+  static char ID;
+};
+char PopcntOptInserter::ID = 0;
+}
+
+/// Factory used by the target machine to create the pass.
+FunctionPass *llvm::createX86PopcntOptPass() { return new PopcntOptInserter(); }
+
+void PopcntOptInserter::insertXor(MachineBasicBlock::iterator I,
+                                  MachineBasicBlock &MBB, unsigned Xor) {
+  const unsigned DestReg = I->getOperand(0).getReg();
+  // Looked up locally so this helper does not rely on any other member being
+  // set up for register queries.
+  const TargetRegisterInfo *TRI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+
+  // Zeroing DestReg is only safe when the popcnt does not read it. Comparing
+  // against operand 1 alone is not enough: for the memory forms the
+  // destination may also be the index register, or a sub-register of the
+  // base register (e.g. popcntl (%rax), %eax), so every register the
+  // instruction reads is checked for overlap.
+  // NOTE(review): regsOverlap expects physical registers -- this assumes the
+  // pass only runs after register allocation; confirm against the pipeline.
+  for (unsigned Idx = 1, NumOps = I->getNumOperands(); Idx != NumOps; ++Idx) {
+    const MachineOperand &MO = I->getOperand(Idx);
+    if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0)
+      continue;
+    if (TRI->regsOverlap(DestReg, MO.getReg()))
+      return;
+  }
+
+  // The xor's inputs carry no meaningful value (reg ^ reg is always zero), so
+  // mark them undef rather than as genuine reads of DestReg.
+  BuildMI(MBB, I, I->getDebugLoc(), TII->get(Xor), DestReg)
+      .addReg(DestReg, RegState::Undef)
+      .addReg(DestReg, RegState::Undef);
+  ++NumXOR;
+  EverMadeChange = true;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// xor instructions before popcnt.
+bool PopcntOptInserter::runOnMachineFunction(MachineFunction &MF) {
+  // getSubtarget() without a template argument yields the generic
+  // TargetSubtargetInfo, which cannot bind to an X86Subtarget reference, so
+  // request the X86 subtarget explicitly.
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  // NOTE(review): hasAVX() looks like a stand-in for "microarchitecture with
+  // the popcnt false dependency" -- confirm it does not over-match.
+  if (!ST.hasAVX() || !ST.hasPOPCNT())
+    return false;
+  TII = ST.getInstrInfo();
+  EverMadeChange = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator Next = MBB.begin(), End = MBB.end();
+         Next != End;) {
+      // Advance before handling the current instruction: insertXor places the
+      // xor in front of it, so the new instruction is never visited.
+      MachineBasicBlock::iterator Cur = Next++;
+      // Pick the xor whose operand width matches the popcnt.
+      switch (Cur->getOpcode()) {
+      case X86::POPCNT16rr:
+      case X86::POPCNT16rm:
+        insertXor(Cur, MBB, X86::XOR16rr);
+        break;
+      case X86::POPCNT32rr:
+      case X86::POPCNT32rm:
+        insertXor(Cur, MBB, X86::XOR32rr);
+        break;
+      case X86::POPCNT64rr:
+      case X86::POPCNT64rm:
+        insertXor(Cur, MBB, X86::XOR64rr);
+        break;
+      default:
+        break;
+      }
+    }
+  }
+  return EverMadeChange;
+}
Index: ../llvm/lib/Target/X86/X86TargetMachine.cpp =================================================================== --- ../llvm/lib/Target/X86/X86TargetMachine.cpp +++ ../llvm/lib/Target/X86/X86TargetMachine.cpp @@ -275,6 +275,11 @@ if (getOptLevel() != CodeGenOpt::None) addPass(createExecutionDependencyFixPass(&X86::VR128RegClass)); + // the pass should be called post DCE pass + // and post RA pass + if (getOptLevel() != CodeGenOpt::None) + addPass(createX86PopcntOptPass()); + if (UseVZeroUpper) addPass(createX86IssueVZeroUpperPass()); Index: ../llvm/test/CodeGen/X86/popcnt.ll =================================================================== --- ../llvm/test/CodeGen/X86/popcnt.ll +++ ../llvm/test/CodeGen/X86/popcnt.ll @@ -1,38 +1,81 @@ ; RUN: llc -march=x86-64 -mattr=+popcnt < %s | FileCheck %s +; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck --check-prefix=ALL %s +; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck --check-prefix=ALL %s +; RUN: llc < %s -march=x86-64 -mcpu=haswell | FileCheck --check-prefix=ALL %s +; RUN: llc < %s -march=x86-64 -mcpu=knl | FileCheck --check-prefix=ALL --check-prefix=POPCNT %s define i8 @cnt8(i8 %x) nounwind readnone { %cnt = tail call i8 @llvm.ctpop.i8(i8
%x) ret i8 %cnt ; CHECK-LABEL: cnt8: +; CHECK-NOT: xorw ; CHECK: popcntw ; CHECK: ret + +; ALL-LABEL: cnt8: +; ALL: popcntw +; ALL: ret + } define i16 @cnt16(i16 %x) nounwind readnone { %cnt = tail call i16 @llvm.ctpop.i16(i16 %x) ret i16 %cnt ; CHECK-LABEL: cnt16: +; CHECK-NOT: xorw ; CHECK: popcntw ; CHECK: ret + +; ALL-LABEL: cnt16: +; ALL: xorw +; ALL-NEXT: popcntw +; ALL: ret } define i32 @cnt32(i32 %x) nounwind readnone { %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) ret i32 %cnt ; CHECK-LABEL: cnt32: +; CHECK-NOT: xorl ; CHECK: popcntl ; CHECK: ret +; ALL-LABEL: cnt32: +; ALL: xorl +; ALL-NEXT: popcntl +; ALL: ret } define i64 @cnt64(i64 %x) nounwind readnone { %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) ret i64 %cnt ; CHECK-LABEL: cnt64: +; CHECK-NOT: xorq ; CHECK: popcntq ; CHECK: ret + +; ALL-LABEL: cnt64: +; ALL: xorq +; ALL-NEXT: popcntq +; ALL: ret } +define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { +; Test case for destReg == srcReg, where +; inserting an xor would be illegal (it would clobber the source). +; POPCNT-LABEL: testv16i32: +; POPCNT: # BB#0: +; POPCNT-NEXT: vmovdqa32 (%rcx), %zmm0 +; POPCNT-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; POPCNT-NEXT: vpextrd $1, %xmm1, %eax +; POPCNT-NEXT: popcntl %eax, %eax +; POPCNT-NEXT: vmovd %xmm1, %ecx +; POPCNT-NEXT: popcntl %ecx, %ecx +; POPCNT-NEXT: vmovd %ecx, %xmm2 + %out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in) + ret <16 x i32> %out +} declare i8 @llvm.ctpop.i8(i8) nounwind readnone declare i16 @llvm.ctpop.i16(i16) nounwind readnone declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i64 @llvm.ctpop.i64(i64) nounwind readnone +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>)