Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -197,12 +197,55 @@
   return nullptr;
 }
 
+/// Try to simplify an insertps call whose control byte (operand 2) is a
+/// constant. Returns a zero vector when every zero-mask bit is set, a single
+/// shufflevector when no zero-mask bit is set, and nullptr (no change) when
+/// the control byte is not a constant or only some zero-mask bits are set.
+static Value *SimplifyX86insertps(const IntrinsicInst &II,
+                                  InstCombiner::BuilderTy &Builder) {
+  if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+    VectorType *VecTy = cast<VectorType>(II.getType());
+    ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
+    // The immediate permute control byte looks like this:
+    //    [3:0] - zero mask for each 32-bit lane
+    //    [5:4] - select one 32-bit destination lane
+    //    [7:6] - select one 32-bit source lane
+
+    uint8_t Imm = CInt->getZExtValue();
+    uint8_t ZMask = Imm & 0xf;
+    uint8_t DestLane = (Imm >> 4) & 0x3;
+    uint8_t SourceLane = (Imm >> 6) & 0x3;
+
+    // If all zero mask bits are set, this was just a weird way to
+    // generate a zero vector.
+    if (ZMask == 0xf)
+      return ZeroVector;
+
+    // TODO: Model this case as two shuffles or a 'logical and' plus shuffle?
+    if (ZMask)
+      return nullptr;
+
+    assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
+
+    // If we're not zeroing anything, this is a single shuffle.
+    // Replace the selected destination lane with the selected source lane.
+    // For all other lanes, pass the first source bits through.
+    int ShuffleMask[4] = { 0, 1, 2, 3 };
+    ShuffleMask[DestLane] = SourceLane + 4; // 4-7 index the second operand.
+
+    return Builder.CreateShuffleVector(II.getArgOperand(0), II.getArgOperand(1),
+                                       ShuffleMask);
+  }
+  return nullptr;
+}
+
 /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
 /// source vectors, unless a zero bit is set. If a zero bit is set,
 /// then ignore that half of the mask and clear that half of the vector.
 static Value *SimplifyX86vperm2(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
-  if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+  if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
     VectorType *VecTy = cast<VectorType>(II.getType());
     ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
 
@@ -730,7 +773,11 @@
     }
     break;
   }
-
+  case Intrinsic::x86_sse41_insertps:
+    if (Value *V = SimplifyX86insertps(*II, *Builder))
+      return ReplaceInstUsesWith(*II, V);
+    break;
+
   case Intrinsic::x86_sse4a_insertqi: {
     // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
     // ones undef
Index: llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll
+++ llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll
@@ -0,0 +1,117 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+; This should never happen, but make sure we don't crash handling a non-constant immediate byte.
+
+define <4 x float> @insertps_non_const_imm(<4 x float> %v1, <4 x float> %v2, i8 %c) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_non_const_imm
+; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 %c)
+; CHECK-NEXT: ret <4 x float>
+}
+
+; If all zero mask bits are set, return a zero regardless of the other control bits.
+
+define <4 x float> @insertps_0x0f(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 15)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x0f
+; CHECK-NEXT: ret <4 x float> zeroinitializer
+}
+define <4 x float> @insertps_0xff(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 255)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xff
+; CHECK-NEXT: ret <4 x float> zeroinitializer
+}
+
+; If some zero mask bits are set, we do not change anything.
+
+define <4 x float> @insertps_0x03(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x03
+; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3)
+; CHECK-NEXT: ret <4 x float>
+}
+
+; If no zero mask bits are set, convert to a shuffle.
+
+define <4 x float> @insertps_0x00(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 0)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x00
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x float>
+}
+
+define <4 x float> @insertps_0x10(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 16)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x10
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x float>
+}
+
+define <4 x float> @insertps_0x20(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 32)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x20
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT: ret <4 x float>
+}
+
+define <4 x float> @insertps_0x30(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 48)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0x30
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; CHECK-NEXT: ret <4 x float>
+}
+
+define <4 x float> @insertps_0xc0(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 192)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xc0
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x float>
+}
+
+define <4 x float> @insertps_0xd0(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 208)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xd0
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x float>
+}
+
+define <4 x float> @insertps_0xe0(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 224)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xe0
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
+; CHECK-NEXT: ret <4 x float>
+}
+
+define <4 x float> @insertps_0xf0(<4 x float> %v1, <4 x float> %v2) {
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 240)
+  ret <4 x float> %res
+
+; CHECK-LABEL: @insertps_0xf0
+; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x float>
+}
+