Index: docs/LangRef.html
===================================================================
--- docs/LangRef.html
+++ docs/LangRef.html
@@ -329,6 +329,8 @@
  • 'llvm.objectsize' Intrinsic
+ • 'llvm.assume.aligned' Intrinsic
  • 'llvm.expect' Intrinsic
  • 'llvm.donothing' Intrinsic

@@ -9105,6 +9107,49 @@

'llvm.assume.aligned' Intrinsic

Syntax:

This is an overloaded intrinsic. You can use llvm.assume.aligned on pointers
in any address space and with any integer bit width for the offset.

  declare i8* @llvm.assume.aligned.p0i8.i64(i8* <addr>, i32 <align>, i64 <offset>)

Overview:

The 'llvm.assume.aligned.*' intrinsics allow the optimizer to assume that the
returned address (which references the same memory location as the address
provided) lies the given number of bytes past an address with the specified
alignment.

Arguments:

The first argument is the pointer about which the alignment assumption is
made. This pointer is also the return value of the intrinsic, and the
assumption applies to uses of the return value.

The second argument is the alignment, in bytes.

The third argument is the offset of the provided address from an address with
the specified alignment. For example, if an alignment of 32 and an offset of 8
are specified, then the return value minus 8 bytes is assumed to be 32-byte
aligned.

Semantics:

The optimizer may use this assumption to override the alignments specified on
loads, stores, memory intrinsics, and other alignment-sensitive instructions
that use the returned address. This applies not only to direct uses of the
returned address, but also to uses of addresses derived from it.
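For illustration, here is a minimal IR sketch, modeled on the @foo test added
later in this patch (the names @example, %p, %q, and %v are illustrative only).
It asserts that the incoming pointer is 32-byte aligned, which lets the new
-assumealign pass raise the load's alignment:

  define i32 @example(i32* %a) nounwind readonly {
  entry:
    %p = bitcast i32* %a to i8*
    ; assume %p itself (offset 0) is 32-byte aligned
    %aligned = call i8* @llvm.assume.aligned.p0i8.i64(i8* %p, i32 32, i64 0)
    %q = bitcast i8* %aligned to i32*
    ; -assumealign may raise this load's alignment from 4 to 32 and then
    ; remove the assume.aligned call, forwarding its pointer operand
    %v = load i32* %q, align 4
    ret i32 %v
  }

  declare i8* @llvm.assume.aligned.p0i8.i64(i8*, i32, i64) nounwind readnone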

    'llvm.donothing' Intrinsic

    Index: include/llvm-c/Transforms/Scalar.h =================================================================== --- include/llvm-c/Transforms/Scalar.h +++ include/llvm-c/Transforms/Scalar.h @@ -35,6 +35,9 @@ /** See llvm::createAggressiveDCEPass function. */ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM); +/** See llvm::createAssumeAlignPass function. */ +void LLVMAddAssumeAlignPass(LLVMPassManagerRef PM); + /** See llvm::createCFGSimplificationPass function. */ void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM); Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -66,6 +66,7 @@ void initializeAliasSetPrinterPass(PassRegistry&); void initializeAlwaysInlinerPass(PassRegistry&); void initializeArgPromotionPass(PassRegistry&); +void initializeAssumeAlignPass(PassRegistry&); void initializeBarrierNoopPass(PassRegistry&); void initializeBasicAliasAnalysisPass(PassRegistry&); void initializeBasicCallGraphPass(PassRegistry&); Index: include/llvm/Intrinsics.td =================================================================== --- include/llvm/Intrinsics.td +++ include/llvm/Intrinsics.td @@ -230,6 +230,9 @@ [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem, NoCapture<0>]>; +def int_assume_aligned : Intrinsic<[llvm_anyptr_ty], + [LLVMMatchType<0>, llvm_i32_ty, + llvm_anyint_ty], [IntrNoMem]>; def int_pcmarker : Intrinsic<[], [llvm_i32_ty]>; def int_readcyclecounter : Intrinsic<[llvm_i64_ty]>; Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -50,6 +50,7 @@ (void) llvm::createAliasAnalysisCounterPass(); (void) llvm::createAliasDebugger(); (void) llvm::createArgumentPromotionPass(); + (void) llvm::createAssumeAlignPass(); (void) llvm::createBasicAliasAnalysisPass(); (void) llvm::createLibCallAliasAnalysisPass(0); (void) llvm::createScalarEvolutionAliasAnalysisPass(); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -32,6 +32,12 @@ //===----------------------------------------------------------------------===// // +// AssumeAlign - A worklist driven alignment assumption propagation pass +// +FunctionPass *createAssumeAlignPass(); + +//===----------------------------------------------------------------------===// +// // SCCP - Sparse conditional constant propagation. // FunctionPass *createSCCPPass(); Index: lib/Analysis/BasicAliasAnalysis.cpp =================================================================== --- lib/Analysis/BasicAliasAnalysis.cpp +++ lib/Analysis/BasicAliasAnalysis.cpp @@ -217,6 +217,21 @@ return V; } +static const Value *getAssumeAlignedPtr(const Value *V) { + const CallInst *CI = dyn_cast(V); + if (!CI) + return 0; + + const Function *F = CI->getCalledFunction(); + if (!F) + return 0; + + if (F->getIntrinsicID() == Intrinsic::assume_aligned) + return CI->getArgOperand(0); + + return 0; +} + /// DecomposeGEPExpression - If V is a symbolic pointer expression, decompose it /// into a base pointer with a constant offset and a number of scaled symbolic /// offsets. @@ -239,6 +254,11 @@ BaseOffs = 0; do { + if (const Value *V2 = getAssumeAlignedPtr(V)) { + V = V2; + continue; + } + // See if this is a bitcast or GEP. 
const Operator *Op = dyn_cast(V); if (Op == 0) { @@ -1158,6 +1178,21 @@ return Alias; } +static const Value *stripPointerCasts(const Value *V) { + bool Done; + do { + V = V->stripPointerCasts(); + if (const Value *V2 = getAssumeAlignedPtr(V)) { + V = V2; + Done = false; + } else { + Done = true; + } + } while (!Done); + + return V; +} + // aliasCheck - Provide a bunch of ad-hoc rules to disambiguate in common cases, // such as array references. // @@ -1172,8 +1207,8 @@ return NoAlias; // Strip off any casts if they exist. - V1 = V1->stripPointerCasts(); - V2 = V2->stripPointerCasts(); + V1 = stripPointerCasts(V1); + V2 = stripPointerCasts(V2); // Are we checking for alias of the same value? if (V1 == V2) return MustAlias; Index: lib/Analysis/CaptureTracking.cpp =================================================================== --- lib/Analysis/CaptureTracking.cpp +++ lib/Analysis/CaptureTracking.cpp @@ -16,6 +16,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Function.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/CaptureTracking.h" @@ -108,34 +111,31 @@ if (CS.onlyReadsMemory() && CS.doesNotThrow() && I->getType()->isVoidTy()) break; - // Not captured if only passed via 'nocapture' arguments. Note that - // calling a function pointer does not in itself cause the pointer to - // be captured. This is a subtle point considering that (for example) - // the callee might return its own address. It is analogous to saying - // that loading a value from a pointer does not cause the pointer to be - // captured, even though the loaded value might be the pointer itself - // (think of self-referential objects). - CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); - for (CallSite::arg_iterator A = B; A != E; ++A) - if (A->get() == V && !CS.doesNotCapture(A - B)) - // The parameter is not marked 'nocapture' - captured. - if (Tracker->captured(U)) - return; - break; + bool IsAssumeAligned = false; + if (Function *F = CS.getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::assume_aligned) + IsAssumeAligned = true; + + if (!IsAssumeAligned) { + // Not captured if only passed via 'nocapture' arguments. Note that + // calling a function pointer does not in itself cause the pointer to + // be captured. This is a subtle point considering that (for example) + // the callee might return its own address. It is analogous to saying + // that loading a value from a pointer does not cause the pointer to be + // captured, even though the loaded value might be the pointer itself + // (think of self-referential objects). + CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); + for (CallSite::arg_iterator A = B; A != E; ++A) + if (A->get() == V && !CS.doesNotCapture(A - B)) + // The parameter is not marked 'nocapture' - captured. + if (Tracker->captured(U)) + return; + + break; + } + + // fallthrough; treat assume_aligned like a bitcast } - case Instruction::Load: - // Loading from a pointer does not cause it to be captured. - break; - case Instruction::VAArg: - // "va-arg" from a pointer does not cause it to be captured. - break; - case Instruction::Store: - if (V == I->getOperand(0)) - // Stored the pointer - conservatively assume it may be captured. - if (Tracker->captured(U)) - return; - // Storing to the pointee does not cause the pointer to be captured. 
- break; case Instruction::BitCast: case Instruction::GetElementPtr: case Instruction::PHI: @@ -149,6 +149,19 @@ Worklist.push_back(U); } break; + case Instruction::Load: + // Loading from a pointer does not cause it to be captured. + break; + case Instruction::VAArg: + // "va-arg" from a pointer does not cause it to be captured. + break; + case Instruction::Store: + if (V == I->getOperand(0)) + // Stored the pointer - conservatively assume it may be captured. + if (Tracker->captured(U)) + return; + // Storing to the pointee does not cause the pointer to be captured. + break; case Instruction::ICmp: // Don't count comparisons of a no-alias return value against null as // captures. This allows us to ignore comparisons of malloc results Index: lib/Analysis/CodeMetrics.cpp =================================================================== --- lib/Analysis/CodeMetrics.cpp +++ lib/Analysis/CodeMetrics.cpp @@ -67,6 +67,7 @@ switch (II->getIntrinsicID()) { default: return false; + case Intrinsic::assume_aligned: case Intrinsic::dbg_declare: case Intrinsic::dbg_value: case Intrinsic::invariant_start: Index: lib/Analysis/InstructionSimplify.cpp =================================================================== --- lib/Analysis/InstructionSimplify.cpp +++ lib/Analysis/InstructionSimplify.cpp @@ -19,6 +19,9 @@ #define DEBUG_TYPE "instsimplify" #include "llvm/GlobalAlias.h" +#include "llvm/Function.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" #include "llvm/Operator.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/SetVector.h" @@ -690,6 +693,21 @@ return true; } +static Value *getAssumeAlignedPtr(Value *V) { + CallInst *CI = dyn_cast(V); + if (!CI) + return 0; + + Function *F = CI->getCalledFunction(); + if (!F) + return 0; + + if (F->getIntrinsicID() == Intrinsic::assume_aligned) + return CI->getArgOperand(0); + + return 0; +} + /// \brief Compute the base pointer and cumulative constant offsets for V. /// /// This strips all constant offsets off of V, leaving it the base pointer, and @@ -715,6 +733,8 @@ V = GEP->getPointerOperand(); } else if (Operator::getOpcode(V) == Instruction::BitCast) { V = cast(V)->getOperand(0); + } else if (Value *V2 = getAssumeAlignedPtr(V)) { + V = V2; } else if (GlobalAlias *GA = dyn_cast(V)) { if (GA->mayBeOverridden()) break; Index: lib/Analysis/Loads.cpp =================================================================== --- lib/Analysis/Loads.cpp +++ lib/Analysis/Loads.cpp @@ -48,6 +48,21 @@ return false; } +static Value *getAssumeAlignedPtr(Value *V) { + CallInst *CI = dyn_cast(V); + if (!CI) + return 0; + + Function *F = CI->getCalledFunction(); + if (!F) + return 0; + + if (F->getIntrinsicID() == Intrinsic::assume_aligned) + return CI->getArgOperand(0); + + return 0; +} + /// getUnderlyingObjectWithOffset - Strip off up to MaxLookup GEPs and /// bitcasts to get back to the underlying object being addressed, keeping /// track of the offset in bytes from the GEPs relative to the result. 
@@ -68,6 +83,8 @@ V = GEP->getPointerOperand(); } else if (Operator::getOpcode(V) == Instruction::BitCast) { V = cast(V)->getOperand(0); + } else if (Value *V2 = getAssumeAlignedPtr(V)) { + V = V2; } else if (GlobalAlias *GA = dyn_cast(V)) { if (GA->mayBeOverridden()) return V; Index: lib/Analysis/ScalarEvolution.cpp =================================================================== --- lib/Analysis/ScalarEvolution.cpp +++ lib/Analysis/ScalarEvolution.cpp @@ -62,9 +62,12 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" +#include "llvm/Function.h" #include "llvm/GlobalVariable.h" #include "llvm/GlobalAlias.h" #include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" #include "llvm/Operator.h" #include "llvm/Analysis/ConstantFolding.h" @@ -3808,6 +3811,16 @@ case Instruction::SExt: return getSignExtendExpr(getSCEV(U->getOperand(0)), U->getType()); + case Instruction::Call: + if (isSCEVable(V->getType())) { + // calls to assume_align are treated like no-op casts. + CallInst *CI = cast(V); + Function *F = CI->getCalledFunction(); + if (F && F->getIntrinsicID() == Intrinsic::assume_aligned) + return getSCEV(CI->getArgOperand(0)); + } + + break; case Instruction::BitCast: // BitCasts are no-op casts so we just eliminate the cast. if (isSCEVable(U->getType()) && isSCEVable(U->getOperand(0)->getType())) Index: lib/Analysis/ValueTracking.cpp =================================================================== --- lib/Analysis/ValueTracking.cpp +++ lib/Analysis/ValueTracking.cpp @@ -1585,6 +1585,21 @@ return 0; } +static Value *getAssumeAlignedPtr(Value *V) { + CallInst *CI = dyn_cast(V); + if (!CI) + return 0; + + Function *F = CI->getCalledFunction(); + if (!F) + return 0; + + if (F->getIntrinsicID() == Intrinsic::assume_aligned) + return CI->getArgOperand(0); + + return 0; +} + /// GetPointerBaseWithConstantOffset - Analyze the specified pointer to see if /// it can be expressed as a base pointer plus a constant offset. Return the /// base and offset to the caller. @@ -1597,6 +1612,9 @@ // Just look through bitcasts. if (PtrOp->getOpcode() == Instruction::BitCast) return GetPointerBaseWithConstantOffset(PtrOp->getOperand(0), Offset, TD); + + if (Value *P2 = getAssumeAlignedPtr(Ptr)) + return GetPointerBaseWithConstantOffset(P2, Offset, TD); // If this is a GEP with constant indices, we can look through it. GEPOperator *GEP = dyn_cast(PtrOp); @@ -1787,6 +1805,8 @@ if (GA->mayBeOverridden()) return V; V = GA->getAliasee(); + } else if (Value *VP = getAssumeAlignedPtr(V)) { + V = VP; } else { // See if InstructionSimplify knows any relevant tricks. if (Instruction *I = dyn_cast(V)) @@ -1896,6 +1916,7 @@ // information about their operands. // FIXME: There are other no-op synthetic instructions that potentially // should be considered at least *safe* to speculate... + case Intrinsic::assume_aligned: case Intrinsic::dbg_declare: case Intrinsic::dbg_value: return true; Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5060,6 +5060,7 @@ setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, dl, MVT::i32)); return 0; + case Intrinsic::assume_aligned: case Intrinsic::expect: { // Just replace __builtin_expect(exp, c) with EXP. 
setValue(&I, getValue(I.getArgOperand(0))); Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -199,6 +199,8 @@ MPM.add(createLoopUnrollPass()); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); + MPM.add(createAssumeAlignPass()); // Alignment assumptions + if (OptLevel > 1) MPM.add(createGVNPass()); // Remove redundancies MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset Index: lib/Transforms/Scalar/AssumeAlign.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/AssumeAlign.cpp @@ -0,0 +1,303 @@ +//===- AssumeAlign.cpp - Code to perform Simple Constant Propagation -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements alignment assumption propagation. +// +//===----------------------------------------------------------------------===// + +#define AA_NAME "assumealign" +#define DEBUG_TYPE AA_NAME +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Constant.h" +#include "llvm/Instruction.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/DataLayout.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/PostOrderIterator.h" +using namespace llvm; + +STATISTIC(NumLoadAlignChanged, + "Number of loads changed by alignment assumptions"); +STATISTIC(NumStoreAlignChanged, + "Number of stores changed by alignment assumptions"); +STATISTIC(NumMemIntAlignChanged, + "Number of memory intrinsics changed by alignment assumptions"); + +namespace { + struct AssumeAlign : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + AssumeAlign() : FunctionPass(ID) { + initializeAssumeAlignPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired(); + } + }; +} + +char AssumeAlign::ID = 0; +static const char assumealign_name[] = "Alignment assumption propagation"; +INITIALIZE_PASS_BEGIN(AssumeAlign, AA_NAME, + assumealign_name, false, false) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_END(AssumeAlign, AA_NAME, + assumealign_name, false, false) + +FunctionPass *llvm::createAssumeAlignPass() { + return new AssumeAlign(); +} + +static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV, + const SCEV *AlignSCEV, + ScalarEvolution *SE) { + // DiffUnits = Diff % int64_t(Alignment) + const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV); + const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV); + const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV); + + DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " << + *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n"); + + if (const SCEVConstant *ConstDUSCEV = + dyn_cast(DiffUnitsSCEV)) { + int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue(); + 
+ if (!DiffUnits) + return (unsigned) + cast(AlignSCEV)->getValue()->getSExtValue(); + + uint64_t DiffUnitsAbs = abs64(DiffUnits); + if (isPowerOf2_64(DiffUnitsAbs)) + return (unsigned) DiffUnitsAbs; + } + + return 0; +} + +static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV, + const SCEV *OffSCEV, Value *Ptr, + ScalarEvolution *SE) { + const SCEV *PtrSCEV = SE->getSCEV(Ptr); + const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV); + + // What we really want to know if the overall offset to the aligned + // address. This address is displaced by the provided offset. + DiffSCEV = SE->getAddExpr(DiffSCEV, OffSCEV); + + DEBUG(dbgs() << "assumealign: alignment of " << *Ptr << " relative to " << + *AlignSCEV << " and offset " << *OffSCEV << + " using diff " << *DiffSCEV << "\n"); + + unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE); + DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n"); + + if (NewAlignment) { + return NewAlignment; + } else if (const SCEVAddRecExpr *DiffARSCEV = + dyn_cast(DiffSCEV)) { + // The relative offset to the alignment assumption did not yield a constant, + // but we should try harder: if we assume that a is 32-byte aligned, then in + // for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are + // 32-byte aligned, but instead alternate between 32 and 16-byte alignment. + // As a result, the new alignment will not be a constant, but can still + // be improved over the default (of 4) to 16. + + const SCEV *DiffStartSCEV = DiffARSCEV->getStart(); + const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE); + + DEBUG(dbgs() << "\ttrying start/inc alignment using start " << + *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n"); + + NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE); + unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE); + + DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n"); + DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n"); + + if (NewAlignment > NewIncAlignment) { + if (NewAlignment % NewIncAlignment == 0) { + DEBUG(dbgs() << "\tnew start/inc alignment: " << + NewIncAlignment << "\n"); + return NewIncAlignment; + } + } else if (NewIncAlignment > NewAlignment) { + if (NewIncAlignment % NewAlignment == 0) { + DEBUG(dbgs() << "\tnew start/inc alignment: " << + NewAlignment << "\n"); + return NewAlignment; + } + } else if (NewIncAlignment == NewAlignment && NewIncAlignment) { + DEBUG(dbgs() << "\tnew start/inc alignment: " << + NewAlignment << "\n"); + return NewAlignment; + } + } + + return 0; +} + +bool AssumeAlign::runOnFunction(Function &F) { + SmallVector AACalls; + BasicBlock *EntryBB = F.begin(); + for (df_iterator I = df_begin(EntryBB), IE = df_end(EntryBB); I != IE; ++I) + for (BasicBlock::iterator J = I->getFirstInsertionPt(), JE = I->end(); + J != JE; ++J) + if (CallInst *CI = dyn_cast(J)) + if (Function *F2 = CI->getCalledFunction()) + if (F2->getIntrinsicID() == Intrinsic::assume_aligned) + AACalls.push_back(CI); + + bool Changed = false; + ScalarEvolution *SE = &getAnalysis(); + + DenseMap NewDestAlignments, NewSrcAlignments; + + for (SmallVector::iterator I = AACalls.begin(), + IE = AACalls.end(); I != IE; ++I) { + const SCEV *AASCEV = SE->getSCEV(*I); + + Value *AAPtr = (*I)->getArgOperand(0); + Value *AAAlign = (*I)->getArgOperand(1); + Value *AAOffset = (*I)->getArgOperand(2); + + Type *Int64Ty = Type::getInt64Ty(F.getContext()); + uint64_t Alignment; + if (ConstantInt *CI = dyn_cast(AAAlign)) { 
+ Alignment = CI->getZExtValue(); + } else { + continue; + } + const SCEV *AlignSCEV = SE->getConstant(Int64Ty, Alignment); + + const SCEV *OffSCEV = SE->getSCEV(AAOffset); + unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits(); + if (OffSCEVBits < 64) + OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty); + else if (OffSCEVBits > 64) + continue; + + DenseSet Visited; + SmallVector WorkList; + for (Value::use_iterator J = (*I)->use_begin(), + JE = (*I)->use_end(); J != JE; ++J) + if (Instruction *K = dyn_cast(*J)) + WorkList.push_back(K); + + while (!WorkList.empty()) { + Instruction *J = WorkList.pop_back_val(); + + if (LoadInst *LI = dyn_cast(J)) { + unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, + LI->getPointerOperand(), SE); + + if (NewAlignment > LI->getAlignment()) { + LI->setAlignment(NewAlignment); + ++NumLoadAlignChanged; + Changed = true; + } + } else if (StoreInst *SI = dyn_cast(J)) { + unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, + SI->getPointerOperand(), SE); + + if (NewAlignment > SI->getAlignment()) { + SI->setAlignment(NewAlignment); + ++NumStoreAlignChanged; + Changed = true; + } + } else if (MemIntrinsic *MI = dyn_cast(J)) { + unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, + MI->getDest(), SE); + + // For memory transfers, we need a common alignment for both the + // source and destination. If we have a new alignment for this + // instruction, but only for one operand, save it. If we reach the + // other operand through another assumption later, then we may + // change the alignment at that point. + if (MemTransferInst *MTI = dyn_cast(MI)) { + unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, + MTI->getSource(), SE); + + DenseMap::iterator DI = + NewDestAlignments.find(MTI); + unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ? + 0 : DI->second; + + DenseMap::iterator SI = + NewSrcAlignments.find(MTI); + unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ? + 0 : SI->second; + + DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " << + AltDestAlignment << " " << NewSrcAlignment << + " " << AltSrcAlignment << "\n"); + + // If these four alignments, pick the largest possible... 
+ unsigned NewAlignment = 0; + if (NewDestAlignment <= NewSrcAlignment || + NewDestAlignment <= AltSrcAlignment) + NewAlignment = std::max(NewAlignment, NewDestAlignment); + if (AltDestAlignment <= NewSrcAlignment || + AltDestAlignment <= AltSrcAlignment) + NewAlignment = std::max(NewAlignment, AltDestAlignment); + if (NewSrcAlignment <= NewDestAlignment || + NewSrcAlignment <= AltDestAlignment) + NewAlignment = std::max(NewAlignment, NewSrcAlignment); + if (AltSrcAlignment <= NewDestAlignment || + AltSrcAlignment <= AltDestAlignment) + NewAlignment = std::max(NewAlignment, AltSrcAlignment); + + if (NewAlignment > MI->getAlignment()) { + MI->setAlignment(ConstantInt::get(Type::getInt32Ty( + MI->getParent()->getContext()), NewAlignment)); + ++NumMemIntAlignChanged; + Changed = true; + } + + NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment)); + NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment)); + } else if (NewDestAlignment > MI->getAlignment()) { + MI->setAlignment(ConstantInt::get(Type::getInt32Ty( + MI->getParent()->getContext()), NewDestAlignment)); + ++NumMemIntAlignChanged; + Changed = true; + } + } + + Visited.insert(J); + for (Value::use_iterator UJ = J->use_begin(), UE = J->use_end(); + UJ != UE; ++UJ) { + Instruction *K = cast(*UJ); + if (!Visited.count(K)) + WorkList.push_back(cast(*UJ)); + } + } + + // Now that the alignment assumption has been processed, remove it. + (*I)->replaceAllUsesWith(AAPtr); + (*I)->eraseFromParent(); + + Changed = true; + } + + return Changed; +} + Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMScalarOpts ADCE.cpp + AssumeAlign.cpp BasicBlockPlacement.cpp CodeGenPrepare.cpp ConstantProp.cpp Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -28,6 +28,7 @@ /// ScalarOpts library. void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); + initializeAssumeAlignPass(Registry); initializeBlockPlacementPass(Registry); initializeCodeGenPreparePass(Registry); initializeConstantPropagationPass(Registry); @@ -76,6 +77,10 @@ unwrap(PM)->add(createAggressiveDCEPass()); } +void LLVMAddAssumeAlignPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createAssumeAlignPass()); +} + void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createCFGSimplificationPass()); } Index: lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- lib/Transforms/Utils/SimplifyCFG.cpp +++ lib/Transforms/Utils/SimplifyCFG.cpp @@ -3890,6 +3890,13 @@ if (BitCastInst *BC = dyn_cast(Use)) return passingValueIsAlwaysUndefined(V, BC); + // assume_aligned is like a bitcast. + if (CallInst *CI = dyn_cast(Use)) + if (Function *F = CI->getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::assume_aligned && + CI->getArgOperand(0) == I) + return passingValueIsAlwaysUndefined(V, CI); + // Load from null is undefined. 
if (LoadInst *LI = dyn_cast(Use)) return LI->getPointerAddressSpace() == 0; Index: test/Analysis/BasicAA/assume-aligned.ll =================================================================== --- /dev/null +++ test/Analysis/BasicAA/assume-aligned.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +; Check that geps with equal base offsets of noalias base pointers stay noalias. +; (make sure that basicaa can see though assume.aligned intrinsics calls) +define i32 @test(i32* %p, i16 %i) { + %pi = getelementptr i32* %p, i32 0 + %pi.next = getelementptr i32* %p, i32 1 + %b = icmp eq i16 %i, 0 + br i1 %b, label %bb1, label %bb2 + +bb1: + %f = getelementptr i32* %pi, i32 1 + %g = getelementptr i32* %pi.next, i32 1 + br label %bb3 +bb2: + %f2a = getelementptr i32* %pi, i32 1 + %g2a = getelementptr i32* %pi.next, i32 1 + + %f2 = call i32* @llvm.assume.aligned.p0i32.i64(i32* %f2a, i32 32, i64 0) + %g2 = call i32* @llvm.assume.aligned.p0i32.i64(i32* %g2a, i32 32, i64 0) + + br label %bb3 + +bb3: + %ptr_phi = phi i32* [ %f, %bb1 ], [ %f2, %bb2 ] + %ptr_phi2 = phi i32* [ %g, %bb1 ], [ %g2, %bb2 ] +; CHECK: NoAlias: i32* %f1, i32* %g1 + %f1 = getelementptr i32* %ptr_phi , i32 1 + %g1 = getelementptr i32* %ptr_phi2 , i32 1 + + ret i32 0 +} + +declare i32* @llvm.assume.aligned.p0i32.i64(i32*, i32, i64) nounwind readnone + Index: test/Transforms/AssumeAlign/lit.local.cfg =================================================================== --- /dev/null +++ test/Transforms/AssumeAlign/lit.local.cfg @@ -0,0 +1,2 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + Index: test/Transforms/AssumeAlign/simple.ll =================================================================== --- /dev/null +++ test/Transforms/AssumeAlign/simple.ll @@ -0,0 +1,194 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +; RUN: opt < %s -assumealign -S | FileCheck %s + +define i32 @foo(i32* nocapture %a) nounwind uwtable readonly { +entry: + %0 = bitcast i32* %a to i8* + %alignedval = tail call i8* @llvm.assume.aligned.p0i8.i64(i8* %0, i32 32, i64 0) + %1 = bitcast i8* %alignedval to i32* + %2 = load i32* %1, align 4 + ret i32 %2 + +; CHECK: @foo +; CHECK: load i32* {{[^,]+}}, align 32 +; CHECK: ret i32 +} + +define i32 @foo2(i32* nocapture %a) nounwind uwtable readonly { +entry: + %0 = bitcast i32* %a to i8* + %alignedval = tail call i8* @llvm.assume.aligned.p0i8.i32(i8* %0, i32 32, i32 8) + %arrayidx = getelementptr inbounds i8* %alignedval, i64 8 + %1 = bitcast i8* %arrayidx to i32* + %2 = load i32* %1, align 4 + ret i32 %2 + +; CHECK: @foo2 +; CHECK: load i32* {{[^,]+}}, align 16 +; CHECK: ret i32 +} + +define i32 @foo2a(i32* nocapture %a) nounwind uwtable readonly { +entry: + %0 = bitcast i32* %a to i8* + %alignedval = tail call i8* @llvm.assume.aligned.p0i8.i32(i8* %0, i32 32, i32 4) + %arrayidx = getelementptr inbounds i8* %alignedval, i64 -4 + %1 = bitcast i8* %arrayidx to i32* + %2 = load i32* %1, align 4 + ret i32 %2 + +; CHECK: @foo2a +; CHECK: load i32* {{[^,]+}}, align 32 +; CHECK: ret i32 +} + +define i32 @goo(i32* nocapture %a) nounwind uwtable readonly { +entry: + %0 = bitcast i32* %a to i8* + %alignedval = tail call i8* @llvm.assume.aligned.p0i8.i32(i8* %0, i32 32, i32 0) + %1 = bitcast i8* 
%alignedval to i32* + %2 = load i32* %1, align 4 + ret i32 %2 + +; CHECK: @goo +; CHECK: load i32* {{[^,]+}}, align 32 +; CHECK: ret i32 +} + +define i32 @hoo(i32* nocapture %a) nounwind uwtable readonly { +entry: + %0 = bitcast i32* %a to i8* + %alignedval = tail call i8* @llvm.assume.aligned.p0i8.i64(i8* %0, i32 32, i64 0) + %1 = bitcast i8* %alignedval to i32* + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32* %1, i64 %indvars.iv + %2 = load i32* %arrayidx, align 4 + %add = add nsw i32 %2, %r.06 + %indvars.iv.next = add i64 %indvars.iv, 8 + %3 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %3, 2048 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + ret i32 %add.lcssa + +; CHECK: @hoo +; CHECK: load i32* %arrayidx, align 32 +; CHECK: ret i32 %add.lcssa +} + +define i32 @joo(i32* nocapture %a) nounwind uwtable readonly { +entry: + %0 = bitcast i32* %a to i8* + %alignedval = tail call i8* @llvm.assume.aligned.p0i8.i64(i8* %0, i32 32, i64 0) + %1 = bitcast i8* %alignedval to i32* + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 4, %entry ], [ %indvars.iv.next, %for.body ] + %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32* %1, i64 %indvars.iv + %2 = load i32* %arrayidx, align 4 + %add = add nsw i32 %2, %r.06 + %indvars.iv.next = add i64 %indvars.iv, 8 + %3 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %3, 2048 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + ret i32 %add.lcssa + +; CHECK: @joo +; CHECK: load i32* %arrayidx, align 16 +; CHECK: ret i32 %add.lcssa +} + +define i32 @koo(i32* nocapture %a) nounwind uwtable readonly { +entry: + %0 = bitcast i32* %a to i8* + %alignedval = tail call i8* @llvm.assume.aligned.p0i8.i64(i8* %0, i32 32, i64 0) + %1 = bitcast i8* %alignedval to i32* + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32* %1, i64 %indvars.iv + %2 = load i32* %arrayidx, align 4 + %add = add nsw i32 %2, %r.06 + %indvars.iv.next = add i64 %indvars.iv, 4 + %3 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %3, 2048 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + ret i32 %add.lcssa +; CHECK: @koo +; CHECK: load i32* %arrayidx, align 16 +; CHECK: ret i32 %add.lcssa +} + +define i32 @koo2(i32* nocapture %a) nounwind uwtable readonly { +entry: + %0 = bitcast i32* %a to i8* + %alignedval = tail call i8* @llvm.assume.aligned.p0i8.i64(i8* %0, i32 32, i64 0) + %1 = bitcast i8* %alignedval to i32* + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ -4, %entry ], [ %indvars.iv.next, %for.body ] + %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32* %1, i64 %indvars.iv + %2 = load i32* %arrayidx, align 4 + %add = add nsw i32 %2, %r.06 + %indvars.iv.next = add i64 %indvars.iv, 4 + %3 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %3, 2048 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %add.lcssa = 
phi i32 [ %add, %for.body ] + ret i32 %add.lcssa + +; CHECK: @koo2 +; CHECK: load i32* %arrayidx, align 16 +; CHECK: ret i32 %add.lcssa +} + +define i32 @moo(i32* nocapture %a) nounwind uwtable { +entry: + %q = bitcast i32* %a to i8* + %alignedval = tail call i8* @llvm.assume.aligned.p0i8.i64(i8* %q, i32 32, i64 0) + tail call void @llvm.memset.p0i8.i64(i8* %alignedval, i8 0, i64 64, i32 4, i1 false) + ret i32 undef +; CHECK: @moo +; CHECK: @llvm.memset.p0i8.i64(i8* %q, i8 0, i64 64, i32 32, i1 false) +; CHECK: ret i32 undef +} + +define i32 @moo2(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + %i0 = bitcast i32* %a to i8* + %alignedval = tail call i8* @llvm.assume.aligned.p0i8.i64(i8* %i0, i32 32, i64 0) + %i1 = bitcast i32* %b to i8* + %alignedval1 = tail call i8* @llvm.assume.aligned.p0i8.i64(i8* %i1, i32 128, i64 0) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %alignedval, i8* %alignedval1, i64 64, i32 4, i1 false) + ret i32 undef +; CHECK: @moo2 +; CHECK: @llvm.memcpy.p0i8.p0i8.i64(i8* %i0, i8* %i1, i64 64, i32 32, i1 false) +; CHECK: ret i32 undef +} + +declare i8* @llvm.assume.aligned.p0i8.i64(i8*, i32, i64) nounwind readnone +declare i8* @llvm.assume.aligned.p0i8.i32(i8*, i32, i32) nounwind readnone + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind +
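As a closing usage note, here is a sketch of the rewrite that the @moo2 test
above checks for: when both operands of a memory transfer carry alignment
assumptions, the pass picks the largest alignment valid for both operands,
rewrites the operands to the original pointers, and erases the assume calls.

  ; RUN with:  opt < input.ll -assumealign -S
  ;
  ; Before (inside @moo2): the destination is assumed 32-byte aligned and the
  ; source 128-byte aligned, but the memcpy itself only claims align 4:
  ;   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %alignedval, i8* %alignedval1, i64 64, i32 4, i1 false)
  ;
  ; After: the common alignment is 32, and the assume.aligned calls are gone:
  ;   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %i0, i8* %i1, i64 64, i32 32, i1 false)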