Index: lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp =================================================================== --- lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -503,7 +503,8 @@ int64_t ConstantOffset = ConstantOffsetExtractor::Extract(GEP->getOperand(I), NewIdx, DL, GEP); if (ConstantOffset != 0) { - assert(NewIdx && "ConstantOffset != 0 implies NewIdx is set"); + assert(NewIdx != nullptr && + "ConstantOffset != 0 implies NewIdx is set"); GEP->setOperand(I, NewIdx); // Clear the inbounds attribute because the new index may be off-bound. // e.g., @@ -538,44 +539,65 @@ // => add the offset // // %gep2 ; clone of %gep - // %0 = ptrtoint %gep2 - // %1 = add %0, <offset> - // %new.gep = inttoptr %1 + // %new.gep = gep %gep2, <offset / sizeof(*%gep)> // %gep ; will be removed // ... %gep ... // // => replace all uses of %gep with %new.gep and remove %gep // // %gep2 ; clone of %gep - // %0 = ptrtoint %gep2 - // %1 = add %0, <offset> - // %new.gep = inttoptr %1 + // %new.gep = gep %gep2, <offset / sizeof(*%gep)> // ... %new.gep ... // - // TODO(jingyue): Emit a GEP instead of an "uglygep" - // (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep) to make the IR - // prettier and more alias analysis friendly. One caveat: if the original GEP - // ends with a StructType, we need to split the GEP at the last - // SequentialType. For instance, consider the following IR: + // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we bitcast + // %gep2 to i8*, add the offset, and bitcast it back to the type of %gep: // - // %struct.S = type { float, double } - // @array = global [1024 x %struct.S] - // %p = getelementptr %array, 0, %i + 5, 1 - // - // To separate the constant 5 from %p, we would need to split %p at the last - // array index so that we have: - // - // %addr = gep %array, 0, %i - // %p = gep %addr, 5, 1 + // %gep2 ; clone of %gep + // %0 = bitcast %gep2 to i8* + // %uglygep = gep %0, <offset> + // %new.gep = bitcast %uglygep to <type of %gep> + // ...
%new.gep ... Instruction *NewGEP = GEP->clone(); NewGEP->insertBefore(GEP); + Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); - Value *Addr = new PtrToIntInst(NewGEP, IntPtrTy, "", GEP); - Addr = BinaryOperator::CreateAdd( - Addr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "", GEP); - Addr = new IntToPtrInst(Addr, GEP->getType(), "", GEP); + uint64_t ElementTypeSizeOfGEP = + DL->getTypeAllocSize(GEP->getType()->getElementType()); + if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { + // Very likely. As long as %gep is naturally aligned, the byte offset we + // extracted should be a multiple of sizeof(*%gep). + // Per ANSI C standard, signed / unsigned = unsigned. Therefore, we + // cast ElementTypeSizeOfGEP to signed. + int64_t Index = + AccumulativeByteOffset / static_cast<int64_t>(ElementTypeSizeOfGEP); + NewGEP = GetElementPtrInst::Create( + NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); + } else { + // Unlikely but possible. For example, + // #pragma pack(1) + // struct S { + // int a[3]; + // int64 b[8]; + // }; + // #pragma pack() + // + // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After + // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is + // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of + // sizeof(int64). + // + // Emit an uglygep in this case.
+ Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(), + GEP->getPointerAddressSpace()); + NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP); + NewGEP = GetElementPtrInst::Create( + NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), + "uglygep", GEP); + if (GEP->getType() != I8PtrTy) + NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); + } - GEP->replaceAllUsesWith(Addr); + GEP->replaceAllUsesWith(NewGEP); GEP->eraseFromParent(); return true; Index: test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll =================================================================== --- test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll +++ test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll @@ -54,7 +54,6 @@ ; IR-LABEL: @sum_of_array( ; IR: [[BASE_PTR:%[0-9]+]] = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i32 %x, i32 %y -; IR: [[BASE_INT:%[0-9]+]] = ptrtoint float addrspace(3)* [[BASE_PTR]] to i64 -; IR: %5 = add i64 [[BASE_INT]], 4 -; IR: %10 = add i64 [[BASE_INT]], 128 -; IR: %15 = add i64 [[BASE_INT]], 132 +; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 1 +; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 32 +; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 33 Index: test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll =================================================================== --- test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll +++ test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll @@ -39,7 +39,7 @@ } ; CHECK-LABEL: @sext_zext ; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i32 %i, i32 %j -; CHECK: add i64 %{{[0-9]+}}, 136 +; CHECK: getelementptr float* %{{[0-9]+}}, i64 34 ; We should be able to trace into sext/zext if it can be distributed to both ; operands, e.g., sext (add nsw a, b) == add nsw (sext a), (sext b) @@ -55,8 +55,7 @@ } ; CHECK-LABEL: 
@ext_add_no_overflow ; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[0-9]+}}, i64 %{{[0-9]+}} -; CHECK: [[BASE_INT:%[0-9]+]] = ptrtoint float* [[BASE_PTR]] to i64 -; CHECK: add i64 [[BASE_INT]], 132 +; CHECK: getelementptr float* [[BASE_PTR]], i64 33 ; Similar to @ext_add_no_overflow, we should be able to trace into sext/zext if ; its operand is an "or" instruction. @@ -74,8 +73,7 @@ } ; CHECK-LABEL: @ext_or ; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[0-9]+}}, i64 %{{[0-9]+}} -; CHECK: [[BASE_INT:%[0-9]+]] = ptrtoint float* [[BASE_PTR]] to i64 -; CHECK: add i64 [[BASE_INT]], 136 +; CHECK: getelementptr float* [[BASE_PTR]], i64 34 ; We should treat "or" with no common bits (%k) as "add", and leave "or" with ; potentially common bits (%l) as is. @@ -88,8 +86,8 @@ ret float* %p } ; CHECK-LABEL: @or -; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %j, i64 %l -; CHECK: add i64 %{{[0-9]+}}, 384 +; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %j, i64 %l +; CHECK: getelementptr float* [[BASE_PTR]], i64 96 ; The subexpression (b + 5) is used in both "i = a + (b + 5)" and "*out = b + ; 5". When extracting the constant offset 5, make sure "*out = b + 5" isn't @@ -103,8 +101,8 @@ ret float* %p } ; CHECK-LABEL: @expr -; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %0, i64 0 -; CHECK: add i64 %{{[0-9]+}}, 640 +; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %0, i64 0 +; CHECK: getelementptr float* [[BASE_PTR]], i64 160 ; CHECK: store i64 %b5, i64* %out ; Verifies we handle "sub" correctly. 
@@ -116,5 +114,24 @@ } ; CHECK-LABEL: @sub ; CHECK: %[[j2:[0-9]+]] = sub i64 0, %j -; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %[[j2]] -; CHECK: add i64 %{{[0-9]+}}, -620 +; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %[[j2]] +; CHECK: getelementptr float* [[BASE_PTR]], i64 -155 + +%struct.Packed = type <{ [3 x i32], [8 x i64] }> ; <> means packed + +; Verifies we can emit correct uglygep if the address is not naturally aligned. +define i64* @packed_struct(i32 %i, i32 %j) { +entry: + %s = alloca [1024 x %struct.Packed], align 16 + %add = add nsw i32 %j, 3 + %idxprom = sext i32 %add to i64 + %add1 = add nsw i32 %i, 1 + %idxprom2 = sext i32 %add1 to i64 + %arrayidx3 = getelementptr inbounds [1024 x %struct.Packed]* %s, i64 0, i64 %idxprom2, i32 1, i64 %idxprom + ret i64* %arrayidx3 +} +; CHECK-LABEL: @packed_struct +; CHECK: [[BASE_PTR:%[0-9]+]] = getelementptr [1024 x %struct.Packed]* %s, i64 0, i32 %i, i32 1, i32 %j +; CHECK: [[CASTED_PTR:%[0-9]+]] = bitcast i64* [[BASE_PTR]] to i8* +; CHECK: %uglygep = getelementptr i8* [[CASTED_PTR]], i64 100 +; CHECK: bitcast i8* %uglygep to i64*