Index: llvm/trunk/lib/Target/X86/X86InstrCompiler.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrCompiler.td
+++ llvm/trunk/lib/Target/X86/X86InstrCompiler.td
@@ -1279,14 +1279,16 @@
 
 // For other extloads, use subregs, since the high contents of the register are
 // defined after an extload.
+// NOTE: The extloadi64i32 pattern must come first so that it gets the first
+// chance to turn 4-byte-aligned i8/i16 extloads into 32-bit loads.
+def : Pat<(extloadi64i32 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
 def : Pat<(extloadi64i1 addr:$src),
           (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
 def : Pat<(extloadi64i8 addr:$src),
           (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
 def : Pat<(extloadi64i16 addr:$src),
           (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
-def : Pat<(extloadi64i32 addr:$src),
-          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
 
 // anyext. Define these to do an explicit zero-extend to
 // avoid partial-register updates.
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.td
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.td
@@ -1121,7 +1121,19 @@
 def extloadi64i1   : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
 def extloadi64i8   : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
 def extloadi64i16  : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
-def extloadi64i32  : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+// We can treat an i8/i16 extending load to i64 as a 32 bit load if it's known
+// to be 4 byte aligned or better and the load is not volatile.
+def extloadi64i32  : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType != ISD::EXTLOAD) // Only any-extending loads are candidates.
+    return false;
+  if (LD->getMemoryVT() == MVT::i32) // A genuine i32 extload always matches.
+    return true;
+
+  return LD->getAlignment() >= 4 && !LD->isVolatile();
+}]>;
 
 
 // An 'and' node with a single use.
Index: llvm/trunk/test/CodeGen/X86/fp128-cast.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fp128-cast.ll
+++ llvm/trunk/test/CodeGen/X86/fp128-cast.ll
@@ -413,7 +413,7 @@
 ; X64-NEXT:    fstpt (%rsp)
 ; X64-NEXT:    movq (%rsp), %rax
 ; X64-NEXT:    movq %rax, {{.*}}(%rip)
-; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    movw %ax, vf80+{{.*}}(%rip)
 ; X64-NEXT:    addq $24, %rsp
 ; X64-NEXT:    retq
Index: llvm/trunk/test/CodeGen/X86/vector-sext-widen.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-sext-widen.ll
+++ llvm/trunk/test/CodeGen/X86/vector-sext-widen.ll
@@ -1494,7 +1494,7 @@
 define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
 ; SSE2-LABEL: load_sext_4i1_to_4i32:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movzbl (%rdi), %eax
+; SSE2-NEXT:    movl (%rdi), %eax
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shlq $60, %rcx
 ; SSE2-NEXT:    sarq $63, %rcx
@@ -1517,7 +1517,7 @@
 ;
 ; SSSE3-LABEL: load_sext_4i1_to_4i32:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movzbl (%rdi), %eax
+; SSSE3-NEXT:    movl (%rdi), %eax
 ; SSSE3-NEXT:    movq %rax, %rcx
 ; SSSE3-NEXT:    shlq $60, %rcx
 ; SSSE3-NEXT:    sarq $63, %rcx
@@ -1540,7 +1540,7 @@
 ;
 ; SSE41-LABEL: load_sext_4i1_to_4i32:
 ; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movzbl (%rdi), %eax
+; SSE41-NEXT:    movl (%rdi), %eax
 ; SSE41-NEXT:    movq %rax, %rcx
 ; SSE41-NEXT:    shlq $62, %rcx
 ; SSE41-NEXT:    sarq $63, %rcx
@@ -1560,7 +1560,7 @@
 ;
 ; AVX1-LABEL: load_sext_4i1_to_4i32:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    movl (%rdi), %eax
 ; AVX1-NEXT:    movq %rax, %rcx
 ; AVX1-NEXT:    shlq $62, %rcx
 ; AVX1-NEXT:    sarq $63, %rcx
@@ -1580,7 +1580,7 @@
 ;
 ; AVX2-LABEL: load_sext_4i1_to_4i32:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movzbl (%rdi), %eax
+; AVX2-NEXT:    movl (%rdi), %eax
 ; AVX2-NEXT:    movq %rax, %rcx
 ; AVX2-NEXT:    shlq $62, %rcx
 ; AVX2-NEXT:    sarq $63, %rcx
@@ -1781,7 +1781,7 @@
 ;
 ; AVX1-LABEL: load_sext_4i1_to_4i64:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    movl (%rdi), %eax
 ; AVX1-NEXT:    movq %rax, %rcx
 ; AVX1-NEXT:    shlq $62, %rcx
 ; AVX1-NEXT:    sarq $63, %rcx
@@ -1805,7 +1805,7 @@
 ;
 ; AVX2-LABEL: load_sext_4i1_to_4i64:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movzbl (%rdi), %eax
+; AVX2-NEXT:    movl (%rdi), %eax
 ; AVX2-NEXT:    movq %rax, %rcx
 ; AVX2-NEXT:    shlq $60, %rcx
 ; AVX2-NEXT:    sarq $63, %rcx
Index: llvm/trunk/test/CodeGen/X86/vector-sext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-sext.ll
+++ llvm/trunk/test/CodeGen/X86/vector-sext.ll
@@ -1494,7 +1494,7 @@
 define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
 ; SSE2-LABEL: load_sext_4i1_to_4i32:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movzbl (%rdi), %eax
+; SSE2-NEXT:    movl (%rdi), %eax
 ; SSE2-NEXT:    movq %rax, %rcx
 ; SSE2-NEXT:    shlq $60, %rcx
 ; SSE2-NEXT:    sarq $63, %rcx
@@ -1517,7 +1517,7 @@
 ;
 ; SSSE3-LABEL: load_sext_4i1_to_4i32:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movzbl (%rdi), %eax
+; SSSE3-NEXT:    movl (%rdi), %eax
 ; SSSE3-NEXT:    movq %rax, %rcx
 ; SSSE3-NEXT:    shlq $60, %rcx
 ; SSSE3-NEXT:    sarq $63, %rcx
@@ -1540,7 +1540,7 @@
 ;
 ; SSE41-LABEL: load_sext_4i1_to_4i32:
 ; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movzbl (%rdi), %eax
+; SSE41-NEXT:    movl (%rdi), %eax
 ; SSE41-NEXT:    movq %rax, %rcx
 ; SSE41-NEXT:    shlq $62, %rcx
 ; SSE41-NEXT:    sarq $63, %rcx
@@ -1560,7 +1560,7 @@
 ;
 ; AVX1-LABEL: load_sext_4i1_to_4i32:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    movl (%rdi), %eax
 ; AVX1-NEXT:    movq %rax, %rcx
 ; AVX1-NEXT:    shlq $62, %rcx
 ; AVX1-NEXT:    sarq $63, %rcx
@@ -1580,7 +1580,7 @@
 ;
 ; AVX2-LABEL: load_sext_4i1_to_4i32:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movzbl (%rdi), %eax
+; AVX2-NEXT:    movl (%rdi), %eax
 ; AVX2-NEXT:    movq %rax, %rcx
 ; AVX2-NEXT:    shlq $62, %rcx
 ; AVX2-NEXT:    sarq $63, %rcx
@@ -1781,7 +1781,7 @@
 ;
 ; AVX1-LABEL: load_sext_4i1_to_4i64:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    movl (%rdi), %eax
 ; AVX1-NEXT:    movq %rax, %rcx
 ; AVX1-NEXT:    shlq $62, %rcx
 ; AVX1-NEXT:    sarq $63, %rcx
@@ -1805,7 +1805,7 @@
 ;
 ; AVX2-LABEL: load_sext_4i1_to_4i64:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movzbl (%rdi), %eax
+; AVX2-NEXT:    movl (%rdi), %eax
 ; AVX2-NEXT:    movq %rax, %rcx
 ; AVX2-NEXT:    shlq $60, %rcx
 ; AVX2-NEXT:    sarq $63, %rcx
Index: llvm/trunk/test/CodeGen/X86/zext-logicop-shift-load.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/zext-logicop-shift-load.ll
+++ llvm/trunk/test/CodeGen/X86/zext-logicop-shift-load.ll
@@ -5,7 +5,7 @@
 define i64 @test1(i8* %data) {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    movl (%rdi), %eax
 ; CHECK-NEXT:    shlq $2, %rax
 ; CHECK-NEXT:    andl $60, %eax
 ; CHECK-NEXT:    retq
@@ -20,7 +20,7 @@
 define i8* @test2(i8* %data) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    movl (%rdi), %eax
 ; CHECK-NEXT:    andl $15, %eax
 ; CHECK-NEXT:    leaq (%rdi,%rax,4), %rax
 ; CHECK-NEXT:    retq
@@ -53,7 +53,7 @@
 define i64 @test4(i8* %data) {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    movl (%rdi), %eax
 ; CHECK-NEXT:    shrq $2, %rax
 ; CHECK-NEXT:    andl $60, %eax
 ; CHECK-NEXT:    retq