Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -201,7 +201,7 @@ InstCombiner::BuilderTy &Builder) { if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { VectorType *VecTy = cast<VectorType>(II.getType()); - ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); + assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); // The immediate permute control byte looks like this: // [3:0] - zero mask for each 32-bit lane @@ -213,25 +213,42 @@ uint8_t DestLane = (Imm >> 4) & 0x3; uint8_t SourceLane = (Imm >> 6) & 0x3; + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); + // If all zero mask bits are set, this was just a weird way to // generate a zero vector. if (ZMask == 0xf) return ZeroVector; - - // TODO: Model this case as two shuffles or a 'logical and' plus shuffle? - if (ZMask) - return nullptr; - - assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); - // If we're not zeroing anything, this is a single shuffle. - // Replace the selected destination lane with the selected source lane. - // For all other lanes, pass the first source bits through. + // Initialize by passing all of the first source bits through. int ShuffleMask[4] = { 0, 1, 2, 3 }; - ShuffleMask[DestLane] = SourceLane + 4; - - return Builder.CreateShuffleVector(II.getArgOperand(0), II.getArgOperand(1), - ShuffleMask); + + // We may replace the second operand with the zero vector. + Value *V1 = II.getArgOperand(1); + + if (ZMask) { + // If the zero mask is being used with a single input or the zero mask + // overrides the destination lane, this is a shuffle with the zero vector. 
+ if ((II.getArgOperand(0) == II.getArgOperand(1)) || + (ZMask & (1 << DestLane))) { + V1 = ZeroVector; + // We may still move 32-bits of the first source vector from one lane + // to another. + ShuffleMask[DestLane] = SourceLane; + // The zero mask may override the previous insert operation. + for (unsigned i = 0; i < 4; ++i) + if ((ZMask >> i) & 0x1) + ShuffleMask[i] = i + 4; + } else { + // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? + return nullptr; + } + } else { + // Replace the selected destination lane with the selected source lane. + ShuffleMask[DestLane] = SourceLane + 4; + } + + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); } return nullptr; } Index: llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll +++ llvm/trunk/test/Transforms/InstCombine/x86-insertps.ll @@ -30,14 +30,47 @@ ; CHECK-NEXT: ret <4 x float> zeroinitializer } -; If some zero mask bits are set, we do not change anything. +; If some zero mask bits are set that do not override the insertion, we do not change anything. -define <4 x float> @insertps_0x03(<4 x float> %v1, <4 x float> %v2) { - %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3) +define <4 x float> @insertps_0x0c(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12) ret <4 x float> %res -; CHECK-LABEL: @insertps_0x03 -; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 3) +; CHECK-LABEL: @insertps_0x0c +; CHECK-NEXT: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 12) +; CHECK-NEXT: ret <4 x float> +} + +; ...unless both input vectors are the same operand. 
+ +define <4 x float> @insertps_0x15_single_input(<4 x float> %v1) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 21) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0x15_single_input +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float undef>, <4 x i32> <i32 4, i32 0, i32 6, i32 3> +; CHECK-NEXT: ret <4 x float> +} + +; The zero mask overrides the insertion lane. + +define <4 x float> @insertps_0x1a_single_input(<4 x float> %v1) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v1, i8 26) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0x1a_single_input +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> <float undef, float 0.000000e+00, float undef, float 0.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7> +; CHECK-NEXT: ret <4 x float> +} + +; The zero mask overrides the insertion lane, so the second input vector is not used. + +define <4 x float> @insertps_0xc1(<4 x float> %v1, <4 x float> %v2) { + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v1, <4 x float> %v2, i8 193) + ret <4 x float> %res + +; CHECK-LABEL: @insertps_0xc1 +; CHECK-NEXT: shufflevector <4 x float> %v1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 4, i32 1, i32 2, i32 3> ; CHECK-NEXT: ret <4 x float> }