Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -34631,7 +34631,9 @@
     return SDValue();
 
   ShrinkMode Mode;
-  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
+  // pmaddwd multiplies signed 16-bit elements, so bail out when the operands
+  // are only known to fit in unsigned 16 bits (MULU16).
+  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
     return SDValue();
 
   EVT VT = N->getValueType(0);
Index: test/CodeGen/X86/madd.ll
===================================================================
--- test/CodeGen/X86/madd.ll
+++ test/CodeGen/X86/madd.ll
@@ -54,6 +54,50 @@
   ret i32 %13
 }
 
+; A multiply-accumulate of zero-extended i16 values must not be turned into
+; (v)pmaddwd, which interprets its 16-bit inputs as signed.
+;SSE2-label: @test_unsigned_short
+;SSE2-NOT: pmaddwd
+
+;AVX2-label: @test_unsigned_short
+;AVX2-NOT: vpmaddwd
+
+;AVX512-label: @test_unsigned_short
+;AVX512-NOT: vpmaddwd
+
+define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
+entry:
+  %3 = zext i32 %2 to i64
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
+  %4 = getelementptr inbounds i16, i16* %0, i64 %index
+  %5 = bitcast i16* %4 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %5, align 2
+  %6 = zext <8 x i16> %wide.load to <8 x i32>
+  %7 = getelementptr inbounds i16, i16* %1, i64 %index
+  %8 = bitcast i16* %7 to <8 x i16>*
+  %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2
+  %9 = zext <8 x i16> %wide.load14 to <8 x i32>
+  %10 = mul nsw <8 x i32> %9, %6
+  %11 = add nsw <8 x i32> %10, %vec.phi
+  %index.next = add i64 %index, 8
+  %12 = icmp eq i64 %index.next, %3
+  br i1 %12, label %middle.block, label %vector.body
+
+middle.block:
+  %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %11, %rdx.shuf
+  %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15
+  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
+  %13 = extractelement <8 x i32> %bin.rdx18, i32 0
+  ret i32 %13
+}
+
 ;AVX2-label: @_Z9test_charPcS_i
 ;AVX2: vpmovsxbw
 ;AVX2-NEXT: vpmovsxbw