Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -690,6 +690,30 @@
         // TODO: Could compute known zero/one bits based on the input.
         break;
       }
+      case Intrinsic::fshr:
+      case Intrinsic::fshl: { // funnel shifts with a constant shift amount
+        const APInt *SA; // shift amount; must be a constant (or splat)
+        if (!match(I->getOperand(2), m_APInt(SA)))
+          break; // no constant shift amount: nothing to do here
+
+        // Normalize to funnel shift left. APInt shifts of BitWidth are well-
+        // defined, so no need to special-case zero shifts here.
+        uint64_t ShiftAmt = SA->urem(BitWidth);
+        if (II->getIntrinsicID() == Intrinsic::fshr)
+          ShiftAmt = BitWidth - ShiftAmt; // fshr by S == fshl by BitWidth - S
+
+        APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt)); // for operand 0
+        APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt)); // op 1
+        if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
+            SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
+          return I;
+
+        Known.Zero = LHSKnown.Zero.shl(ShiftAmt) | // op 0 bits move up,
+                     RHSKnown.Zero.lshr(BitWidth - ShiftAmt); // op 1 bits down
+        Known.One = LHSKnown.One.shl(ShiftAmt) |
+                    RHSKnown.One.lshr(BitWidth - ShiftAmt);
+        break;
+      }
       case Intrinsic::x86_mmx_pmovmskb:
       case Intrinsic::x86_sse_movmsk_ps:
       case Intrinsic::x86_sse2_movmsk_pd:
Index: llvm/trunk/test/Transforms/InstCombine/fsh.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/fsh.ll
+++ llvm/trunk/test/Transforms/InstCombine/fsh.ll
@@ -255,7 +255,7 @@
 
 define i32 @fshl_only_op0_demanded(i32 %x, i32 %y) {
 ; CHECK-LABEL: @fshl_only_op0_demanded(
-; CHECK-NEXT:    [[Z:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 7)
+; CHECK-NEXT:    [[Z:%.*]] = shl i32 [[X:%.*]], 7
 ; CHECK-NEXT:    [[R:%.*]] = and i32 [[Z]], 128
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@@ -266,7 +266,7 @@
 
 define i32 @fshl_only_op1_demanded(i32 %x, i32 %y) {
 ; CHECK-LABEL: @fshl_only_op1_demanded(
-; CHECK-NEXT:    [[Z:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 7)
+; CHECK-NEXT:    [[Z:%.*]] = lshr i32 [[Y:%.*]], 25
 ; CHECK-NEXT:    [[R:%.*]] = and i32 [[Z]], 63
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@@ -275,9 +275,9 @@
   ret i32 %r
 }
 
-define i33 @fshr_only_op0_demanded(i33 %x, i33 %y) {
-; CHECK-LABEL: @fshr_only_op0_demanded(
-; CHECK-NEXT:    [[Z:%.*]] = call i33 @llvm.fshr.i33(i33 [[X:%.*]], i33 [[Y:%.*]], i33 7)
+define i33 @fshr_only_op1_demanded(i33 %x, i33 %y) {
+; CHECK-LABEL: @fshr_only_op1_demanded(
+; CHECK-NEXT:    [[Z:%.*]] = lshr i33 [[Y:%.*]], 7
 ; CHECK-NEXT:    [[R:%.*]] = and i33 [[Z]], 12392
 ; CHECK-NEXT:    ret i33 [[R]]
 ;
@@ -286,10 +286,10 @@
   ret i33 %r
 }
 
-define i33 @fshr_only_op1_demanded(i33 %x, i33 %y) {
-; CHECK-LABEL: @fshr_only_op1_demanded(
-; CHECK-NEXT:    [[Z:%.*]] = call i33 @llvm.fshr.i33(i33 [[X:%.*]], i33 [[Y:%.*]], i33 7)
-; CHECK-NEXT:    [[R:%.*]] = lshr i33 [[Z]], 30
+define i33 @fshr_only_op0_demanded(i33 %x, i33 %y) {
+; CHECK-LABEL: @fshr_only_op0_demanded(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i33 [[X:%.*]], 4
+; CHECK-NEXT:    [[R:%.*]] = and i33 [[TMP1]], 7
 ; CHECK-NEXT:    ret i33 [[R]]
 ;
   %z = call i33 @llvm.fshr.i33(i33 %x, i33 %y, i33 7)
@@ -297,6 +297,29 @@
   ret i33 %r
 }
 
+define <2 x i31> @fshl_only_op1_demanded_vec_splat(<2 x i31> %x, <2 x i31> %y) {
+; CHECK-LABEL: @fshl_only_op1_demanded_vec_splat(
+; CHECK-NEXT:    [[Z:%.*]] = lshr <2 x i31> [[Y:%.*]], <i31 24, i31 24>
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i31> [[Z]], <i31 63, i31 31>
+; CHECK-NEXT:    ret <2 x i31> [[R]]
+;
+  %z = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> %x, <2 x i31> %y, <2 x i31> <i31 7, i31 7>)
+  %r = and <2 x i31> %z, <i31 63, i31 31>
+  ret <2 x i31> %r
+}
+
+; The shift modulo bitwidth is the same for all vector elements, but this is not simplified yet.
+define <2 x i31> @fshl_only_op1_demanded_vec_nonsplat(<2 x i31> %x, <2 x i31> %y) {
+; CHECK-LABEL: @fshl_only_op1_demanded_vec_nonsplat(
+; CHECK-NEXT:    [[Z:%.*]] = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> [[X:%.*]], <2 x i31> [[Y:%.*]], <2 x i31> <i31 7, i31 38>)
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i31> [[Z]], <i31 63, i31 31>
+; CHECK-NEXT:    ret <2 x i31> [[R]]
+;
+  %z = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> %x, <2 x i31> %y, <2 x i31> <i31 7, i31 38>)
+  %r = and <2 x i31> %z, <i31 63, i31 31>
+  ret <2 x i31> %r
+}
+
 ; Demand bits from both operands -- cannot simplify.
 
 define i32 @fshl_both_ops_demanded(i32 %x, i32 %y) {
@@ -325,11 +348,7 @@
 
 define i32 @fshl_known_bits(i32 %x, i32 %y) {
 ; CHECK-LABEL: @fshl_known_bits(
-; CHECK-NEXT:    [[X2:%.*]] = or i32 [[X:%.*]], 1
-; CHECK-NEXT:    [[Y2:%.*]] = lshr i32 [[Y:%.*]], 1
-; CHECK-NEXT:    [[Z:%.*]] = call i32 @llvm.fshl.i32(i32 [[X2]], i32 [[Y2]], i32 7)
-; CHECK-NEXT:    [[R:%.*]] = and i32 [[Z]], 192
-; CHECK-NEXT:    ret i32 [[R]]
+; CHECK-NEXT:    ret i32 128
 ;
   %x2 = or i32 %x, 1   ; lo bit set
   %y2 = lshr i32 %y, 1 ; hi bit clear
@@ -340,11 +359,7 @@
 
 define i33 @fshr_known_bits(i33 %x, i33 %y) {
 ; CHECK-LABEL: @fshr_known_bits(
-; CHECK-NEXT:    [[X2:%.*]] = or i33 [[X:%.*]], 1
-; CHECK-NEXT:    [[Y2:%.*]] = lshr i33 [[Y:%.*]], 1
-; CHECK-NEXT:    [[Z:%.*]] = call i33 @llvm.fshr.i33(i33 [[X2]], i33 [[Y2]], i33 26)
-; CHECK-NEXT:    [[R:%.*]] = and i33 [[Z]], 192
-; CHECK-NEXT:    ret i33 [[R]]
+; CHECK-NEXT:    ret i33 128
 ;
   %x2 = or i33 %x, 1 ; lo bit set
   %y2 = lshr i33 %y, 1 ; hi bit set