Index: lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- lib/Transforms/InstCombine/InstCombineCalls.cpp +++ lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -204,7 +204,7 @@ InstCombiner::BuilderTy &Builder) { if (auto CInt = dyn_cast(II.getArgOperand(2))) { VectorType *VecTy = cast(II.getType()); - uint8_t Imm = CInt->getZExtValue(); + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination @@ -213,37 +213,64 @@ // [5:4] - select 128 bits from sources for high half of destination // [6] - ignore // [7] - zero high half of destination - - if ((Imm & 0x88) == 0x88) { - // If both zero mask bits are set, this was just a weird way to - // generate a zero vector. - return ConstantAggregateZero::get(VecTy); - } - - // TODO: If a single zero bit is set, replace one of the source operands - // with a zero vector and use the same mask generation logic as below. - - if ((Imm & 0x88) == 0x00) { - // If neither zero mask bit is set, this is a simple shuffle. - unsigned NumElts = VecTy->getNumElements(); - unsigned HalfSize = NumElts / 2; - unsigned HalfBegin; - SmallVector ShuffleMask(NumElts); - - // Permute low half of result. - HalfBegin = (Imm & 0x3) * HalfSize; - for (unsigned i = 0; i != HalfSize; ++i) - ShuffleMask[i] = HalfBegin + i; - - // Permute high half of result. - HalfBegin = ((Imm >> 4) & 0x3) * HalfSize; - for (unsigned i = HalfSize; i != NumElts; ++i) - ShuffleMask[i] = HalfBegin + i - HalfSize; - - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask); - } + + uint8_t Imm = CInt->getZExtValue(); + bool LowHalfZero = Imm & 0x08; + bool HighHalfZero = Imm & 0x80; + uint8_t LowHalfSelect = Imm & 0x3; + uint8_t HighHalfSelect = (Imm >> 4) & 0x3; + + // If both zero mask bits are set, this was just a weird way to + // generate a zero vector. + if (LowHalfZero && HighHalfZero) + return ZeroVector; + + // If no zero mask bits are set, this is a simple shuffle. If one zero + // mask bit is set, we will do some sneaky logic to turn this into one + // simple shuffle. + unsigned NumElts = VecTy->getNumElements(); + unsigned HalfSize = NumElts / 2; + unsigned HalfBegin; + SmallVector ShuffleMask(NumElts); + + Value *Op0 = II.getArgOperand(0); + Value *Op1 = II.getArgOperand(1); + + // If exactly one zero mask bit is set, determine which one of the source + // vectors may be discarded by looking at the opposite half's select bits. + // + // 1. If the opposite half select bits equal 0x2 or 0x3, then the second + // source vector (Operand 1) is in use. + // + // 2. If the opposite half select bits equal 0x0 or 0x1, then the first + // source vector (Operand 0) is in use. + // + // 3. Replace the unused source operand with the zero vector. + // + // 4. Last, overwrite the selection mask on the zeroing side to select + // the zero vector. The low half of the zero vector (0x0 or 0x2) is + // chosen arbitrarily. + + if (LowHalfZero ^ HighHalfZero) { + uint8_t &OtherHalfSelect = LowHalfZero ? HighHalfSelect : LowHalfSelect; + bool Op1Used = OtherHalfSelect & 0x2; + Op0 = Op1Used ? ZeroVector : Op0; + Op1 = Op1Used ? Op1 : ZeroVector; + uint8_t &ZeroHalfSelect = LowHalfZero ? LowHalfSelect : HighHalfSelect; + ZeroHalfSelect = Op1Used ? 0x0 : 0x2; + } + + // Permute low half of result. + HalfBegin = LowHalfSelect * HalfSize; + for (unsigned i = 0; i != HalfSize; ++i) + ShuffleMask[i] = HalfBegin + i; + + // Permute high half of result. + HalfBegin = HighHalfSelect * HalfSize; + for (unsigned i = HalfSize; i != NumElts; ++i) + ShuffleMask[i] = HalfBegin + i - HalfSize; + + return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask); } return nullptr; } Index: test/Transforms/InstCombine/x86-vperm2.ll =================================================================== --- test/Transforms/InstCombine/x86-vperm2.ll +++ test/Transforms/InstCombine/x86-vperm2.ll @@ -207,26 +207,41 @@ } -; Confirm that when a single zero mask bit is set, we do nothing. +; Confirm that when a single zero mask bit is set, we replace a source vector with zeros. + +define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 129) + ret <4 x double> %res + +; CHECK-LABEL: @perm2pd_0x81 +; CHECK-NEXT: shufflevector <4 x double> %a0, <4 x double> +; CHECK-NEXT: ret <4 x double> +} define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) { %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131) ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x83 -; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 -125) +; CHECK-NEXT: shufflevector <4 x double> %a1, <4 x i32> ; CHECK-NEXT: ret <4 x double> } +define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 40) + ret <4 x double> %res -; Confirm that when the other zero mask bit is set, we do nothing. Also confirm that an ignored bit has no effect. +; CHECK-LABEL: @perm2pd_0x28 +; CHECK-NEXT: shufflevector <4 x double> %a1, <4 x i32> +; CHECK-NEXT: ret <4 x double> +} -define <4 x double> @perm2pd_0x48(<4 x double> %a0, <4 x double> %a1) { - %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72) +define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 8) ret <4 x double> %res -; CHECK-LABEL: @perm2pd_0x48 -; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72) +; CHECK-LABEL: @perm2pd_0x08 +; CHECK-NEXT: shufflevector <4 x double> %a0, <4 x double> ; CHECK-NEXT: ret <4 x double> }