Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
@@ -53,7 +53,8 @@
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
-FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &);
+FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
+                                               const TargetMachine *);
 FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPULibCalls.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -15,6 +15,7 @@
 
 #include "AMDGPU.h"
 #include "AMDGPULibFunc.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/ADT/StringSet.h"
@@ -22,6 +23,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
@@ -29,6 +31,7 @@
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <vector>
 #include <cmath>
@@ -65,6 +68,8 @@
 
   typedef llvm::AMDGPULibFunc FuncInfo;
 
+  const TargetMachine *TM;
+
   // -fuse-native.
   bool AllNative = false;
 
@@ -134,6 +139,9 @@
   // __read_pipe/__write_pipe
   bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo);
 
+  // llvm.amdgcn.wavefrontsize
+  bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
+
   // Get insertion point at entry.
   BasicBlock::iterator getEntryIns(CallInst * UI);
   // Insert an Alloc instruction.
@@ -152,6 +160,8 @@
   }
 
 public:
+  AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}
+
   bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);
 
   void initNativeFuncs();
@@ -166,15 +176,16 @@
 
   class AMDGPUSimplifyLibCalls : public FunctionPass {
 
-  AMDGPULibCalls Simplifier;
-
   const TargetOptions Options;
 
+  AMDGPULibCalls Simplifier;
+
   public:
     static char ID; // Pass identification
 
-    AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions())
-      : FunctionPass(ID), Options(Opt) {
+    AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(),
+                           const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), Options(Opt), Simplifier(TM) {
       initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
     }
 
@@ -639,14 +650,6 @@
   // Ignore indirect calls.
   if (Callee == 0) return false;
 
-  FuncInfo FInfo;
-  if (!parseFunctionName(Callee->getName(), &FInfo))
-    return false;
-
-  // Further check the number of arguments to see if they match.
-  if (CI->getNumArgOperands() != FInfo.getNumArgs())
-    return false;
-
   BasicBlock *BB = CI->getParent();
   LLVMContext &Context = CI->getParent()->getContext();
   IRBuilder<> B(Context);
@@ -658,6 +661,21 @@
   if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
     B.setFastMathFlags(FPOp->getFastMathFlags());
 
+  switch (Callee->getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::amdgcn_wavefrontsize:
+    return !EnablePreLink && fold_wavefrontsize(CI, B);
+  }
+
+  FuncInfo FInfo;
+  if (!parseFunctionName(Callee->getName(), &FInfo))
+    return false;
+
+  // Further check the number of arguments to see if they match.
+  if (CI->getNumArgOperands() != FInfo.getNumArgs())
+    return false;
+
   if (TDOFold(CI, FInfo))
     return true;
 
@@ -1371,6 +1389,29 @@
   return true;
 }
 
+bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
+  if (!TM)
+    return false;
+
+  StringRef CPU = TM->getTargetCPU();
+  StringRef Features = TM->getTargetFeatureString();
+  if ((CPU.empty() || CPU.equals_lower("generic")) &&
+      (Features.empty() ||
+       Features.find_lower("wavefrontsize") == StringRef::npos))
+    return false;
+
+  Function *F = CI->getParent()->getParent();
+  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F);
+  unsigned N = ST.getWavefrontSize();
+
+  LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
+               << N << "\n");
+
+  CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N));
+  CI->eraseFromParent();
+  return true;
+}
+
 // Get insertion point at entry.
 BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
   Function * Func = UI->getParent()->getParent();
@@ -1680,8 +1721,9 @@
 }
 
 // Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) {
-  return new AMDGPUSimplifyLibCalls(Opt);
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt,
+                                                     const TargetMachine *TM) {
+  return new AMDGPUSimplifyLibCalls(Opt, TM);
 }
 
 FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -432,7 +432,7 @@
       PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
       PM.add(llvm::createAMDGPUUseNativeCallsPass());
       if (LibCallSimplify)
-        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
+        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
   });
 
   Builder.addExtension(
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
+
+; RUN: opt -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize32 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
+
+; GCN-LABEL: {{^}}fold_wavefrontsize:
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+
+; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 32
+; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 64
+; GCN:       store_dword v[{{[0-9:]+}}], [[V]]
+
+; OPT-W32:   store i32 32, i32 addrspace(1)* %arg, align 4
+; OPT-W64:   store i32 64, i32 addrspace(1)* %arg, align 4
+; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT-WXX:   store i32 %tmp, i32 addrspace(1)* %arg, align 4
+; OPT-NEXT:  ret void
+
+define amdgpu_kernel void @fold_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  store i32 %tmp, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+
+; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
+; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
+; GCN-NOT:   cndmask
+; GCN:       store_dword v[{{[0-9:]+}}], [[V]]
+
+; OPT-W32:   store i32 1, i32 addrspace(1)* %arg, align 4
+; OPT-W64:   store i32 2, i32 addrspace(1)* %arg, align 4
+; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT-WXX:   %tmp1 = icmp ugt i32 %tmp, 32
+; OPT-WXX:   %tmp2 = select i1 %tmp1, i32 2, i32 1
+; OPT-WXX:   store i32 %tmp2, i32 addrspace(1)* %arg
+; OPT-NEXT:  ret void
+
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  %tmp2 = select i1 %tmp1, i32 2, i32 1
+  store i32 %tmp2, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+
+; OPT:       bb:
+; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT-WXX:   %tmp1 = icmp ugt i32 %tmp, 32
+; OPT-WXX:   bb3:
+; OPT-W64:   store i32 1, i32 addrspace(1)* %arg, align 4
+; OPT-NEXT:  ret void
+
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  br i1 %tmp1, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  store i32 1, i32 addrspace(1)* %arg, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wavefrontsize() #0
+
+attributes #0 = { nounwind readnone speculatable }