Index: lib/Transforms/InstCombine/InstCombineCompares.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -909,8 +909,11 @@
           }
 
       // If all indices are the same, just compare the base pointers.
-      if (IndicesTheSame)
-        return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
+      // A vector icmp produces a vector of i1, so the fold is only valid when
+      // comparing the base pointers yields the same result type as I.
+      Type *BaseType = GEPLHS->getOperand(0)->getType();
+      if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType())
+        return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
 
       // If we're comparing GEPs with two base pointers that only differ in type
       // and both GEPs have only constant indices or just one use, then fold
Index: test/Transforms/InstCombine/pr38984.ll
===================================================================
--- test/Transforms/InstCombine/pr38984.ll
+++ test/Transforms/InstCombine/pr38984.ll
@@ -2,24 +2,40 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 target datalayout = "p:16:16"
 
-@offsets = external dso_local global [4 x i16], align 1
+@a = external global [21 x i16], align 1
+@offsets = external global [4 x i16], align 1
 
-define void @PR38984() {
-; CHECK-LABEL: @PR38984(
+; Identical vector GEPs (same base, same indices) must still fold: the icmp becomes all-true.
+define <4 x i1> @PR38984_1() {
+; CHECK-LABEL: @PR38984_1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <4 x i1> <i1 true, i1 true, i1 true, i1 true>
 ;
 entry:
   %0 = load i16, i16* getelementptr ([4 x i16], [4 x i16]* @offsets, i16 0, i16 undef), align 1
   %1 = insertelement <4 x i16> undef, i16 %0, i32 3
-  %2 = sub <4 x i16> zeroinitializer, %1
-  %3 = sext <4 x i16> %2 to <4 x i32>
-  %4 = getelementptr inbounds i64, i64* null, <4 x i32> %3
-  %5 = ptrtoint <4 x i64*> %4 to <4 x i32>
-  %6 = getelementptr inbounds i64, i64* null, <4 x i16> %2
-  %7 = ptrtoint <4 x i64*> %6 to <4 x i32>
-  %8 = icmp eq <4 x i32> %5, %7
-  %9 = select <4 x i1> %8, <4 x i16> zeroinitializer, <4 x i16> <i16 1, i16 1, i16 1, i16 1>
-  %10 = sext <4 x i16> %9 to <4 x i32>
-  ret void
+  %2 = getelementptr i32, i32* null, <4 x i16> %1
+  %3 = getelementptr i32, i32* null, <4 x i16> %1
+  %4 = icmp eq <4 x i32*> %2, %3
+  ret <4 x i1> %4
+}
+
+; Scalar bases with vector indices: comparing only the bases would yield i1, not <4 x i1>, so the icmp must stay.
+define <4 x i1> @PR38984_2() {
+; CHECK-LABEL: @PR38984_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* getelementptr ([4 x i16], [4 x i16]* @offsets, i16 0, i16 undef), align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, i16* getelementptr inbounds ([21 x i16], [21 x i16]* @a, i16 1, i16 0), <4 x i16> [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i16, i16* null, <4 x i16> [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i16*> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret <4 x i1> [[TMP4]]
+;
+entry:
+  %0 = load i16, i16* getelementptr ([4 x i16], [4 x i16]* @offsets, i16 0, i16 undef)
+  %1 = insertelement <4 x i16> undef, i16 %0, i32 3
+  %2 = getelementptr i16, i16* getelementptr ([21 x i16], [21 x i16]* @a, i64 1, i32 0), <4 x i16> %1
+  %3 = getelementptr i16, i16* null, <4 x i16> %1
+  %4 = icmp eq <4 x i16*> %2, %3
+  ret <4 x i1> %4
 }