Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp =================================================================== --- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -204,7 +204,7 @@ InstCombiner::BuilderTy &Builder) { if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { VectorType *VecTy = cast<VectorType>(II.getType()); - uint8_t Imm = CInt->getZExtValue(); + ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination @@ -213,37 +213,51 @@ // [5:4] - select 128 bits from sources for high half of destination // [6] - ignore // [7] - zero high half of destination + + uint8_t Imm = CInt->getZExtValue(); + + bool LowHalfZero = Imm & 0x08; + bool HighHalfZero = Imm & 0x80; + + // If both zero mask bits are set, this was just a weird way to + // generate a zero vector. + if (LowHalfZero && HighHalfZero) + return ZeroVector; + + // If 0 or 1 zero mask bits are set, this is a simple shuffle. + unsigned NumElts = VecTy->getNumElements(); + unsigned HalfSize = NumElts / 2; + SmallVector<int, 8> ShuffleMask(NumElts); + + // The high bit of the selection field chooses the 1st or 2nd operand. + bool LowInputSelect = Imm & 0x02; + bool HighInputSelect = Imm & 0x20; - if ((Imm & 0x88) == 0x88) { - // If both zero mask bits are set, this was just a weird way to - // generate a zero vector. - return ConstantAggregateZero::get(VecTy); - } - - // TODO: If a single zero bit is set, replace one of the source operands - // with a zero vector and use the same mask generation logic as below. - - if ((Imm & 0x88) == 0x00) { - // If neither zero mask bit is set, this is a simple shuffle. - unsigned NumElts = VecTy->getNumElements(); - unsigned HalfSize = NumElts / 2; - unsigned HalfBegin; - SmallVector<int, 8> ShuffleMask(NumElts); - - // Permute low half of result. 
- HalfBegin = (Imm & 0x3) * HalfSize; - for (unsigned i = 0; i != HalfSize; ++i) - ShuffleMask[i] = HalfBegin + i; + // The low bit of the selection field chooses the low or high half + // of the selected operand. + bool LowHalfSelect = Imm & 0x01; + bool HighHalfSelect = Imm & 0x10; + + // Determine which operand(s) are actually in use for this instruction. + Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); + Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); - // Permute high half of result. - HalfBegin = ((Imm >> 4) & 0x3) * HalfSize; - for (unsigned i = HalfSize; i != NumElts; ++i) - ShuffleMask[i] = HalfBegin + i - HalfSize; - - Value *Op0 = II.getArgOperand(0); - Value *Op1 = II.getArgOperand(1); - return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask); - } + // If needed, replace operands based on zero mask. + V0 = LowHalfZero ? ZeroVector : V0; + V1 = HighHalfZero ? ZeroVector : V1; + + // Permute low half of result. + unsigned StartIndex = LowHalfSelect ? HalfSize : 0; + for (unsigned i = 0; i < HalfSize; ++i) + ShuffleMask[i] = StartIndex + i; + + // Permute high half of result. + StartIndex = HighHalfSelect ? 
HalfSize : 0; + StartIndex += NumElts; + for (unsigned i = 0; i < HalfSize; ++i) + ShuffleMask[i + HalfSize] = StartIndex + i; + + return Builder.CreateShuffleVector(V0, V1, ShuffleMask); } return nullptr; } Index: llvm/trunk/test/Transforms/InstCombine/x86-vperm2.ll =================================================================== --- llvm/trunk/test/Transforms/InstCombine/x86-vperm2.ll +++ llvm/trunk/test/Transforms/InstCombine/x86-vperm2.ll @@ -76,7 +76,7 @@ ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x02 -; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 4, i32 5, i32 0, i32 1> +; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; CHECK-NEXT: ret <4 x double> %1 } @@ -85,7 +85,7 @@ ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x03 -; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 6, i32 7, i32 0, i32 1> +; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; CHECK-NEXT: ret <4 x double> %1 } @@ -111,7 +111,7 @@ ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x12 -; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3> +; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7> ; CHECK-NEXT: ret <4 x double> %1 } @@ -120,7 +120,7 @@ ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x13 -; CHECK-NEXT: %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> +; CHECK-NEXT: %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; CHECK-NEXT: ret <4 x double> %1 } @@ -207,26 +207,41 @@ } -; Confirm that when a single zero mask bit is set, we do nothing. +; Confirm that when a single zero mask bit is set, we replace a source vector with zeros. 
+ +define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 129) + ret <4 x double> %res + +; CHECK-LABEL: @perm2pd_0x81 +; CHECK-NEXT: shufflevector <4 x double> %a0, <4 x double> <double 0.0{{.*}}, <4 x i32> <i32 2, i32 3, i32 4, i32 5> +; CHECK-NEXT: ret <4 x double> +} define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) { %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131) ret <4 x double> %res ; CHECK-LABEL: @perm2pd_0x83 -; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 -125) +; CHECK-NEXT: shufflevector <4 x double> %a1, <4 x double> <double 0.0{{.*}}, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; CHECK-NEXT: ret <4 x double> } +define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 40) + ret <4 x double> %res -; Confirm that when the other zero mask bit is set, we do nothing. Also confirm that an ignored bit has no effect. +; CHECK-LABEL: @perm2pd_0x28 +; CHECK-NEXT: shufflevector <4 x double> <double 0.0{{.*}}, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: ret <4 x double> +} -define <4 x double> @perm2pd_0x48(<4 x double> %a0, <4 x double> %a1) { - %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72) +define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) { + %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 8) ret <4 x double> %res -; CHECK-LABEL: @perm2pd_0x48 -; CHECK-NEXT: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 72) +; CHECK-LABEL: @perm2pd_0x08 +; CHECK-NEXT: shufflevector <4 x double> <double 0.0{{.*}}, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: ret <4 x double> }