Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td @@ -1172,19 +1172,6 @@ llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; } -// Vector extract and insert -let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx_vextractf128_pd_256 : - GCCBuiltin<"__builtin_ia32_vextractf128_pd256">, - Intrinsic<[llvm_v2f64_ty], [llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx_vextractf128_ps_256 : - GCCBuiltin<"__builtin_ia32_vextractf128_ps256">, - Intrinsic<[llvm_v4f32_ty], [llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx_vextractf128_si_256 : - GCCBuiltin<"__builtin_ia32_vextractf128_si256">, - Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; -} - // Vector convert let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx_cvtdq2_pd_256 : GCCBuiltin<"__builtin_ia32_cvtdq2pd256">, Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4978,9 +4978,6 @@ setValue(&I, Res); return nullptr; } - case Intrinsic::x86_avx_vextractf128_pd_256: - case Intrinsic::x86_avx_vextractf128_ps_256: - case Intrinsic::x86_avx_vextractf128_si_256: case Intrinsic::x86_avx2_vextracti128: { EVT DestVT = TLI.getValueType(I.getType()); uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(1))->getZExtValue() & 1) * Index: llvm/trunk/lib/IR/AutoUpgrade.cpp =================================================================== --- llvm/trunk/lib/IR/AutoUpgrade.cpp +++ llvm/trunk/lib/IR/AutoUpgrade.cpp @@ -161,6 +161,9 @@ Name == "x86.avx.vinsertf128.pd.256" || Name == "x86.avx.vinsertf128.ps.256" || Name == "x86.avx.vinsertf128.si.256" || + Name == 
"x86.avx.vextractf128.pd.256" || + Name == "x86.avx.vextractf128.ps.256" || + Name == "x86.avx.vextractf128.si.256" || Name == "x86.avx.movnt.dq.256" || Name == "x86.avx.movnt.pd.256" || Name == "x86.avx.movnt.ps.256" || @@ -676,6 +679,26 @@ Idxs2.push_back(Builder.getInt32(Idx)); } Rep = Builder.CreateShuffleVector(Op0, Rep, ConstantVector::get(Idxs2)); + } else if (Name == "llvm.x86.avx.vextractf128.pd.256" || + Name == "llvm.x86.avx.vextractf128.ps.256" || + Name == "llvm.x86.avx.vextractf128.si.256") { + Value *Op0 = CI->getArgOperand(0); + unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue(); + VectorType *VecTy = cast<VectorType>(CI->getType()); + unsigned NumElts = VecTy->getNumElements(); + + // Mask off the high bits of the immediate value; hardware ignores those. + Imm = Imm & 1; + + // Get indexes for either the high half or low half of the input vector. + SmallVector<Constant*, 4> Idxs(NumElts); + for (unsigned i = 0; i != NumElts; ++i) { + unsigned Idx = Imm ? (i + NumElts) : i; + Idxs[i] = Builder.getInt32(Idx); + } + + Value *UndefV = UndefValue::get(Op0->getType()); + Rep = Builder.CreateShuffleVector(Op0, UndefV, ConstantVector::get(Idxs)); } else { bool PD128 = false, PD256 = false, PS128 = false, PS256 = false; if (Name == "llvm.x86.avx.vpermil.pd.256") Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -36,6 +36,43 @@ } declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone +; We don't check any vextractf128 variant with immediate 0 because that's just a move. 
+ +define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) { +; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1: +; CHECK: vextractf128 $1, %ymm0, %xmm0 + %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1) + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone + +define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) { +; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1: +; CHECK: vextractf128 $1, %ymm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1) + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone + +define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) { +; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1: +; CHECK: vextractf128 $1, %ymm0, %xmm0 + %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone + +; Verify that high bits of the immediate are masked off. This should be the equivalent +; of a vextractf128 $0 which should be optimized away, so just check that it's +; not a vextractf128 of any kind. 
+define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) { +; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2: +; CHECK-NOT: vextractf128 + %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2) + ret <2 x double> %res +} + + define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: test_x86_avx_blend_pd_256: ; CHECK: vblendpd Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll +++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -2163,30 +2163,6 @@ declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly -define <2 x double> @test_x86_avx_vextractf128_pd_256(<4 x double> %a0) { - ; CHECK: vextractf128 - %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 7) ; <<2 x double>> [#uses=1] - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone - - -define <4 x float> @test_x86_avx_vextractf128_ps_256(<8 x float> %a0) { - ; CHECK: vextractf128 - %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 7) ; <<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone - - -define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) { - ; CHECK: vextractf128 - %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 7) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone - - define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) { ; CHECK: vperm2f128 %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]