Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -69,8 +69,13 @@ if (I->getOpcode() == Instruction::InsertElement && isConstant && isa(I->getOperand(2))) return true; - if (I->getOpcode() == Instruction::Load && I->hasOneUse()) + if (I->hasOneUse()) { + if (I->getOpcode() == Instruction::Load || + // FIXME: Support other types? + (isConstant && isa(I) && + V->getType()->getScalarSizeInBits() == 16)) return true; + } if (BinaryOperator *BO = dyn_cast(I)) if (BO->hasOneUse() && (cheapToScalarize(BO->getOperand(0), isConstant) || Index: test/Transforms/InstCombine/fast-math-scalarization.ll =================================================================== --- test/Transforms/InstCombine/fast-math-scalarization.ll +++ test/Transforms/InstCombine/fast-math-scalarization.ll @@ -28,7 +28,7 @@ ret void } -; CHECK-LABEL: test_extract_element_fastmath +; CHECK-LABEL: @test_extract_element_fastmath ; CHECK: fadd fast float define float @test_extract_element_fastmath(<4 x float> %x) #0 { entry: @@ -37,3 +37,77 @@ ret float %0 } +; CHECK-LABEL: @scalarize_shuffles_half2 +; CHECK: extractelement +; CHECK-NEXT: extractelement +; CHECK-NEXT: fadd fast half +; CHECK-NEXT: ret +define half @scalarize_shuffles_half2(<2 x half> %x, <2 x half> %y) { +entry: + %shuff1 = shufflevector <2 x half> %x, <2 x half> %y, <2 x i32> + %shuff2 = shufflevector <2 x half> %y, <2 x half> %x, <2 x i32> + %vadd = fadd fast <2 x half> %shuff1, %shuff2 + %scalar = extractelement <2 x half> %vadd, i32 0 + ret half %scalar +} + +; CHECK-LABEL: @scalarize_shuffles_2i16 +; CHECK: extractelement +; CHECK-NEXT: extractelement +; CHECK: add i16 +; CHECK-NEXT: ret +define i16 @scalarize_shuffles_2i16(<2 x i16> %x, <2 x i16> %y) { +entry: + %shuff1 = shufflevector <2 x i16> %x, <2 x i16> %y, <2 x i32> + %shuff2 = 
shufflevector <2 x i16> %y, <2 x i16> %x, <2 x i32> + %vadd = add <2 x i16> %shuff1, %shuff2 + %scalar = extractelement <2 x i16> %vadd, i32 0 + ret i16 %scalar +} + +; CHECK-LABEL: @scalarize_multiplebinops +; CHECK: extractelement +; CHECK-NEXT: extractelement +; CHECK-NEXT: extractelement +; CHECK-NEXT: fadd fast half +; CHECK-NEXT: fadd fast half +; CHECK-NEXT: ret +define half @scalarize_multiplebinops(<2 x half> %x, <2 x half> %y) { +entry: + %x.1 = shufflevector <2 x half> %x, <2 x half> undef, <2 x i32> + %shuff1 = shufflevector <2 x half> %x, <2 x half> %y, <2 x i32> + %shuff2 = shufflevector <2 x half> %y, <2 x half> %x, <2 x i32> + %vadd1 = fadd fast <2 x half> %shuff1, %shuff2 + %vadd2 = fadd fast <2 x half> %x.1, %vadd1 + %scalar = extractelement <2 x half> %vadd2, i32 0 + ret half %scalar +} + +; CHECK-LABEL: scalarize_reduction_half2 +; CHECK: extractelement +; CHECK-NEXT: extractelement +; CHECK: fadd fast half +; CHECK: ret +define half @scalarize_reduction_half2(<2 x half> %x) { +entry: + %rdx.shuf1 = shufflevector <2 x half> %x, <2 x half> undef, <2 x i32> + %add = fadd fast <2 x half> %x, %rdx.shuf1 + %scalar = extractelement <2 x half> %add, i32 0 + ret half %scalar +} + +; CHECK-LABEL: @scalarize_reduction_float2 +; TODO: The following test should be scalarized also. cheapToScalarize() in InstCombineVectorOps.cpp is +; restricted to type half only. This is because allowing it for other types might cause performance regression +; on other targets(such as X86). When there is performance regression this is an artifact of pattern matching being +; too restrictive, and should be fixed. +; For example, X86(isHorizontalBinOp) relies on shuffle pattern to +; detect horizontal binary operations. 
Absence of the shuffle causes isHorizontalBinOp to not match the horizontalbinop +; pattern and produces vmovshdup+vaddss instead of vhaddps on avx2; +define float @scalarize_reduction_float2(<2 x float> %x) { +entry: + %rdx.shuf1 = shufflevector <2 x float> %x, <2 x float> undef, <2 x i32> + %add = fadd fast <2 x float> %x, %rdx.shuf1 + %scalar = extractelement <2 x float> %add, i32 0 + ret float %scalar +}