Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsX86.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td
@@ -1183,19 +1183,6 @@
   def int_x86_avx_vextractf128_si_256 :
         GCCBuiltin<"__builtin_ia32_vextractf128_si256">,
         Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty,
                    llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx_vinsertf128_pd_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_pd256">,
-        Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
-                   llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vinsertf128_ps_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_ps256">,
-        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
-                   llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vinsertf128_si_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_si256">,
-        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
-                   llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
 }
 
 // Vector convert
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4956,9 +4956,6 @@
     setValue(&I, Res);
     return nullptr;
   }
-  case Intrinsic::x86_avx_vinsertf128_pd_256:
-  case Intrinsic::x86_avx_vinsertf128_ps_256:
-  case Intrinsic::x86_avx_vinsertf128_si_256:
   case Intrinsic::x86_avx2_vinserti128: {
     EVT DestVT = TLI.getValueType(I.getType());
     EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType());
Index: llvm/trunk/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/trunk/lib/IR/AutoUpgrade.cpp
+++ llvm/trunk/lib/IR/AutoUpgrade.cpp
@@ -7,7 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the auto-upgrade helper functions
+// This file implements the auto-upgrade helper functions.
+// This is where deprecated IR intrinsics and other IR features are updated to
+// current specifications.
 //
 //===----------------------------------------------------------------------===//
 
@@ -156,6 +158,9 @@
       Name.startswith("x86.avx2.pcmpeq.") ||
       Name.startswith("x86.avx2.pcmpgt.") ||
       Name.startswith("x86.avx.vpermil.") ||
+      Name == "x86.avx.vinsertf128.pd.256" ||
+      Name == "x86.avx.vinsertf128.ps.256" ||
+      Name == "x86.avx.vinsertf128.si.256" ||
       Name == "x86.avx.movnt.dq.256" ||
       Name == "x86.avx.movnt.pd.256" ||
       Name == "x86.avx.movnt.ps.256" ||
@@ -626,6 +631,51 @@
     }
     Rep = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
+  } else if (Name == "llvm.x86.avx.vinsertf128.pd.256" ||
+             Name == "llvm.x86.avx.vinsertf128.ps.256" ||
+             Name == "llvm.x86.avx.vinsertf128.si.256") {
+    Value *Op0 = CI->getArgOperand(0);
+    Value *Op1 = CI->getArgOperand(1);
+    unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+    VectorType *VecTy = cast<VectorType>(CI->getType());
+    unsigned NumElts = VecTy->getNumElements();
+
+    // Mask off the high bits of the immediate value; hardware ignores those.
+    Imm = Imm & 1;
+
+    // Extend the second operand into a vector that is twice as big.
+    Value *UndefV = UndefValue::get(Op1->getType());
+    SmallVector<Constant*, 8> Idxs;
+    for (unsigned i = 0; i != NumElts; ++i) {
+      Idxs.push_back(Builder.getInt32(i));
+    }
+    Rep = Builder.CreateShuffleVector(Op1, UndefV, ConstantVector::get(Idxs));
+
+    // Insert the second operand into the first operand.
+
+    // Note that there is no guarantee that instruction lowering will actually
+    // produce a vinsertf128 instruction for the created shuffles. In
+    // particular, the 0 immediate case involves no lane changes, so it can
+    // be handled as a blend.
+
+    // Example of shuffle mask for 32-bit elements:
+    // Imm = 1  <i32 0, i32 1, i32 2,  i32 3,  i32 8, i32 9, i32 10, i32 11>
+    // Imm = 0  <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6,  i32 7 >
+
+    SmallVector<Constant*, 8> Idxs2;
+    // The low half of the result is either the low half of the 1st operand
+    // or the low half of the 2nd operand (the inserted vector).
+    for (unsigned i = 0; i != NumElts / 2; ++i) {
+      unsigned Idx = Imm ? i : (i + NumElts);
+      Idxs2.push_back(Builder.getInt32(Idx));
+    }
+    // The high half of the result is either the low half of the 2nd operand
+    // (the inserted vector) or the high half of the 1st operand.
+    for (unsigned i = NumElts / 2; i != NumElts; ++i) {
+      unsigned Idx = Imm ? (i + NumElts / 2) : i;
+      Idxs2.push_back(Builder.getInt32(Idx));
+    }
+    Rep = Builder.CreateShuffleVector(Op0, Rep, ConstantVector::get(Idxs2));
   } else {
     bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
     if (Name == "llvm.x86.avx.vpermil.pd.256")
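To see what this upgrade path produces, here is a sketch of the IR that a 1-immediate pd.256 call becomes after UpgradeIntrinsicCall runs; the function and value names are invented for illustration, and the names the builder actually assigns will differ:

; %r = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a, <2 x double> %b, i8 1)
; upgrades to roughly:
define <4 x double> @upgrade_pd_imm1_sketch(<4 x double> %a, <2 x double> %b) {
  ; widen %b to <4 x double>; elements 2 and 3 read the undef operand
  %wide = shufflevector <2 x double> %b, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; keep the low half of %a and put %b (the low half of %wide) in the high half
  %r = shufflevector <4 x double> %a, <4 x double> %wide, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %r
}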
Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,5 +1,41 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
 
+; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
+
+define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
+
+define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
+  ret <8 x i32> %res
+}
+
+; Verify that high bits of the immediate are masked off. This should be the equivalent
+; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
+; not a vinsertf128 $1.
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
+  ; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
+
 define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK: vblendpd
   %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
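In test_x86_avx_vinsertf128_si_256_2 above, 2 & 1 == 0, so the call upgrades exactly as an explicit 0 immediate would. A sketch of the resulting shuffles (names invented): the final mask takes its low lane from the widened %a1 and its high lane from %a0, so no element crosses a 128-bit lane, which is why lowering is free to emit a blend instead of a vinsertf128 $1:

define <8 x i32> @upgrade_si_imm2_sketch(<8 x i32> %a0, <4 x i32> %a1) {
  %wide = shufflevector <4 x i32> %a1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; mask <8,9,10,11,4,5,6,7>: %a1 in the low lane, %a0's own high lane on top
  %res = shufflevector <8 x i32> %a0, <8 x i32> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %res
}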
Index: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2187,30 +2187,6 @@
 declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
 
 
-define <4 x double> @test_x86_avx_vinsertf128_pd_256(<4 x double> %a0, <2 x double> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vinsertf128_ps_256(<8 x float> %a0, <4 x float> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vinsertf128_si_256(<8 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
-  ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
-
-
 define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK: vperm2f128
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
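The three deleted tests passed i8 7 as the immediate. Because the auto-upgrade masks the immediate down to its low bit (7 & 1 == 1), such calls now become the same shuffle pair as an explicit i8 1, which is why the replacement tests above use immediates 1 and 2 to pin down both behaviors. A sketch with invented names of what the old i8 7 call turns into:

define <8 x float> @upgrade_ps_imm7_sketch(<8 x float> %a0, <4 x float> %a1) {
  %wide = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; identical to the i8 1 upgrade: %a0's low half, then %a1 in the high half
  %res = shufflevector <8 x float> %a0, <8 x float> %wide, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}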
Index: llvm/trunk/test/CodeGen/X86/avx-vinsertf128.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-vinsertf128.ll
+++ llvm/trunk/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
 
+; CHECK-LABEL: A:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
@@ -9,6 +9,7 @@
   ret <8 x float> %shuffle
 }
 
+; CHECK-LABEL: B:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
@@ -22,7 +23,7 @@
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
 
 ; Just check that no crash happens
-; CHECK-SSE: _insert_crash
+; CHECK-LABEL: _insert_crash:
 define void @insert_crash() nounwind {
 allocas:
   %v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32>
@@ -39,7 +40,7 @@
 
 ;; DAG Combine must remove useless vinsertf128 instructions
 
-; CHECK: DAGCombineA
+; CHECK-LABEL: DAGCombineA:
 ; CHECK-NOT: vinsertf128 $1
 define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
   %1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -47,7 +48,7 @@
   %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %2
 }
-; CHECK: DAGCombineB
+; CHECK-LABEL: DAGCombineB:
 ; CHECK: vpaddd %xmm
 ; CHECK-NOT: vinsertf128 $1
 ; CHECK: vpaddd %xmm
@@ -57,14 +58,7 @@
   ret <8 x i32> %2
 }
 
-; CHECK: insert_pd
-define <4 x double> @insert_pd(<4 x double> %a0, <2 x double> %a1) {
-; CHECK: vinsertf128
-%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 0)
-ret <4 x double> %res
-}
-
-; CHECK: insert_undef_pd
+; CHECK-LABEL: insert_undef_pd:
 define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
@@ -73,14 +67,7 @@
 }
 declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
 
-; CHECK: insert_ps
-define <8 x float> @insert_ps(<8 x float> %a0, <4 x float> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 0)
-ret <8 x float> %res
-}
-
-; CHECK: insert_undef_ps
+; CHECK-LABEL: insert_undef_ps:
 define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
@@ -89,14 +76,7 @@
 }
 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
 
-; CHECK: insert_si
-define <8 x i32> @insert_si(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 0)
-ret <8 x i32> %res
-}
-
-; CHECK: insert_undef_si
+; CHECK-LABEL: insert_undef_si:
 define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
@@ -105,7 +85,7 @@
 declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
 
 ; rdar://10643481
-; CHECK: vinsertf128_combine
+; CHECK-LABEL: vinsertf128_combine:
 define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovaps
 ; CHECK: vinsertf128
@@ -118,7 +98,7 @@
 }
 
 ; rdar://11076953
-; CHECK: vinsertf128_ucombine
+; CHECK-LABEL: vinsertf128_ucombine:
 define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovups
 ; CHECK: vinsertf128
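The insert_undef_* tests above expect a single vmovaps because of how the upgraded shuffles simplify: with a 0 immediate and an undef first operand, every defined lane of the result comes from the widened second operand. A sketch (invented names) for the ps case:

define <8 x float> @insert_undef_ps_sketch(<4 x float> %a1) {
  %wide = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; lanes 4-7 select from the undef first operand, so %res folds to %wide
  ; and lowering can use a plain register-to-register move
  %res = shufflevector <8 x float> undef, <8 x float> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}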
Index: llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -48,58 +48,6 @@
 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
 ; if it's faster.
 
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
-
-; Use the vinsertf128 intrinsic to model source code
-; that explicitly uses AVX intrinsics.
-define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
-  ret <8 x float> %v3
-}
-
-; Swap the operands of the shufflevector and vinsertf128 to ensure that the
-; pattern still matches.
-define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads_swap
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
-  ret <8 x float> %v3
-}
-
-; Replace the vinsertf128 intrinsic with a shufflevector as might be
-; expected from auto-vectorized code.
 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic