Index: llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -690,6 +690,7 @@
   Value *Src = CI.getOperand(0);
   Type *DestTy = CI.getType(), *SrcTy = Src->getType();
+  ConstantInt *Cst = nullptr;
 
   // Attempt to truncate the entire input expression tree to the destination
   // type. Only do this if the dest type is a simple type, don't convert the
@@ -758,7 +759,7 @@
   // more efficiently. Support vector types. Cleanup code by using m_OneUse.
 
   // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
-  Value *A = nullptr; ConstantInt *Cst = nullptr;
+  Value *A = nullptr;
   if (Src->hasOneUse() &&
       match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst)))) {
     // We have three types to worry about here, the type of A, the source of
@@ -843,6 +844,37 @@
   if (Instruction *I = foldVecTruncToExtElt(CI, *this))
     return I;
 
+  // Whenever an element is extracted from a vector and then truncated,
+  // canonicalize to a bitcast of the whole vector followed by an
+  // extractelement of the narrower type.
+  //
+  // Example (little endian):
+  //   trunc (extractelement <4 x i64> %X, 0) to i32
+  //   --->
+  //   extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0
+  Value *VecOp = nullptr;
+  if (match(Src,
+            m_OneUse(m_ExtractElement(m_Value(VecOp), m_ConstantInt(Cst))))) {
+    Type *VecOpTy = VecOp->getType();
+    unsigned DestScalarSize = DestTy->getScalarSizeInBits();
+    unsigned VecOpScalarSize = VecOpTy->getScalarSizeInBits();
+    unsigned VecNumElts = VecOpTy->getVectorNumElements();
+
+    // Each element must split evenly, or the bitcast/index math is invalid.
+    if (VecOpScalarSize % DestScalarSize == 0) {
+      unsigned BitCastNumElts = VecNumElts * VecOpScalarSize / DestScalarSize;
+      unsigned VecOpIdx = Cst->getZExtValue();
+      unsigned NewIdx =
+          DL.isBigEndian()
+              ? (VecOpIdx + 1) * VecOpScalarSize / DestScalarSize - 1
+              : VecOpIdx * VecOpScalarSize / DestScalarSize;
+
+      Type *BitCastTo = VectorType::get(DestTy, BitCastNumElts);
+      Value *BitCast = Builder.CreateBitCast(VecOp, BitCastTo);
+      return ExtractElementInst::Create(BitCast, Builder.getInt32(NewIdx));
+    }
+  }
+
   return nullptr;
 }
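Before the tests, a quick standalone sanity check of the endian-dependent index math in the hunk above (editorial illustration only; remapIndex is a hypothetical helper, not part of the patch, and it assumes the even-split guard already passed):

  // Standalone sketch of the NewIdx computation; not part of the patch.
  #include <cassert>
  #include <cstdio>

  // Mirrors the patch's index math (hypothetical helper for illustration).
  static unsigned remapIndex(unsigned VecOpIdx, unsigned VecOpScalarSize,
                             unsigned DestScalarSize, bool IsBigEndian) {
    assert(VecOpScalarSize % DestScalarSize == 0 && "guard must hold");
    return IsBigEndian ? (VecOpIdx + 1) * VecOpScalarSize / DestScalarSize - 1
                       : VecOpIdx * VecOpScalarSize / DestScalarSize;
  }

  int main() {
    // <3 x i64> truncated to i32: each i64 covers two i32 slots.
    for (unsigned Idx = 0; Idx < 3; ++Idx)
      printf("elt %u -> LE i32 idx %u, BE i32 idx %u\n", Idx,
             remapIndex(Idx, 64, 32, false), remapIndex(Idx, 64, 32, true));
    // Prints 0 -> 0/1, 1 -> 2/3, 2 -> 4/5, matching the CHECK lines in the
    // pr45314_le.ll and pr45314_be.ll tests below.
    return 0;
  }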
Index: llvm/test/Transforms/InstCombine/pr45314_be.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/pr45314_be.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "E"
+
+define i32 @shrinkExtractElt_i64_to_i32_0(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 0
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i32 @shrinkExtractElt_i64_to_i32_1(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 3
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 1
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i32 @shrinkExtractElt_i64_to_i32_2(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 5
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 2
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_0(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 3
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 0
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_1(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 7
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 1
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_2(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 11
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 2
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+; Do not optimize if it would result in an invalid bitcast instruction
+; (i67 does not split evenly into i13 elements).
+define i13 @shrinkExtractElt_i67_to_i13_2(<3 x i67> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i67_to_i13_2(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <3 x i67> [[X:%.*]], i459 2
+; CHECK-NEXT:    [[T:%.*]] = trunc i67 [[E]] to i13
+; CHECK-NEXT:    ret i13 [[T]]
+;
+  %e = extractelement <3 x i67> %x, i459 2
+  %t = trunc i67 %e to i13
+  ret i13 %t
+}
+
+; Do not canonicalize if that would increase the instruction count.
+declare void @use(i64)
+define i16 @shrinkExtractElt_i64_to_i16_2_extra_use(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_2_extra_use(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2
+; CHECK-NEXT:    call void @use(i64 [[E]])
+; CHECK-NEXT:    [[T:%.*]] = trunc i64 [[E]] to i16
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i64 2
+  call void @use(i64 %e)
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
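An editorial aside on why this big-endian file and the little-endian file below expect different indices (illustration, not part of the patch): viewing <3 x i64> as <6 x i32>, each i64 element spans two i32 slots, and on a big-endian target its low 32 bits, the bits trunc keeps, sit in the higher-numbered slot:

  i64 elt:   |    0    |    1    |    2    |
  i32 slot:  |  0 |  1 |  2 |  3 |  4 |  5 |
               hi   lo   hi   lo   hi   lo

Hence indices 1, 3, 5 in the CHECK lines above, versus 0, 2, 4 in pr45314_le.ll below.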
Index: llvm/test/Transforms/InstCombine/pr45314_le.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/pr45314_le.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e"
+
+define i32 @shrinkExtractElt_i64_to_i32_0(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 0
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i32 @shrinkExtractElt_i64_to_i32_1(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 1
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i32 @shrinkExtractElt_i64_to_i32_2(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 4
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 2
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_0(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 0
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 0
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_1(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 4
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 1
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_2(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 8
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 2
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+; Do not optimize if it would result in an invalid bitcast instruction
+; (i67 does not split evenly into i13 elements).
+define i13 @shrinkExtractElt_i67_to_i13_2(<3 x i67> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i67_to_i13_2(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <3 x i67> [[X:%.*]], i459 2
+; CHECK-NEXT:    [[T:%.*]] = trunc i67 [[E]] to i13
+; CHECK-NEXT:    ret i13 [[T]]
+;
+  %e = extractelement <3 x i67> %x, i459 2
+  %t = trunc i67 %e to i13
+  ret i13 %t
+}
+
+; Do not canonicalize if that would increase the instruction count.
+declare void @use(i64)
+define i16 @shrinkExtractElt_i64_to_i16_2_extra_use(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_2_extra_use(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2
+; CHECK-NEXT:    call void @use(i64 [[E]])
+; CHECK-NEXT:    [[T:%.*]] = trunc i64 [[E]] to i16
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i64 2
+  call void @use(i64 %e)
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
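Both test files carry the update_test_checks.py NOTE header, so after changing the transform the CHECK lines should be regenerated rather than edited by hand. A typical invocation (the build/bin path is an assumption about the local build tree, adjust as needed):

  python llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
      llvm/test/Transforms/InstCombine/pr45314_be.ll \
      llvm/test/Transforms/InstCombine/pr45314_le.ll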