diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -99,10 +99,12 @@
     Value *StorePtr = Store->getPointerOperand();
     Type *LoadType = getLoadStoreType(Load);
 
-    assert(LoadPtr->getType()->getPointerAddressSpace() ==
-               StorePtr->getType()->getPointerAddressSpace() &&
-           LoadType == getLoadStoreType(Store) &&
-           "Should be a known dependence");
+    assert(
+        LoadPtr->getType()->getPointerAddressSpace() ==
+            StorePtr->getType()->getPointerAddressSpace() &&
+        LoadType->getPrimitiveSizeInBits().getFixedSize() ==
+            getLoadStoreType(Store)->getPrimitiveSizeInBits().getFixedSize() &&
+        "Should be a known dependence");
 
     // Currently we only support accesses with unit stride. FIXME: we should be
     // able to handle non unit stirde as well as long as the stride is equal to
@@ -211,9 +213,9 @@
       if (!Load)
        continue;
 
-      // Only progagate the value if they are of the same type.
-      if (Store->getPointerOperandType() != Load->getPointerOperandType() ||
-          getLoadStoreType(Store) != getLoadStoreType(Load))
+      // Only propagate the value if the pointees are the same size.
+      if (getLoadStoreType(Store)->getPrimitiveSizeInBits().getFixedSize() !=
+          getLoadStoreType(Load)->getPrimitiveSizeInBits().getFixedSize())
        continue;
 
      Candidates.emplace_front(Load, Store);
@@ -438,7 +440,26 @@
   PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
                                  &L->getHeader()->front());
   PHI->addIncoming(Initial, PH);
-  PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
+
+  Value *StoreValue;
+
+  Type *LoadType = Initial->getType();
+  Type *StoreType = Cand.Store->getOperand(0)->getType();
+
+  assert(LoadType->getPrimitiveSizeInBits().getFixedSize() ==
+             StoreType->getPrimitiveSizeInBits().getFixedSize() &&
+         "The type sizes should match!");
+
+  if ((LoadType != StoreType) &&
+      (CastInst::castIsValid(Instruction::BitCast, StoreType, LoadType))) {
+    // Need a bitcast to convert to the loaded type
+    StoreValue =
+        CastInst::Create(Instruction::BitCast, Cand.Store->getOperand(0),
+                         LoadType, "store_forward_cast", Cand.Store);
+  } else
+    StoreValue = Cand.Store->getOperand(0);
+
+  PHI->addIncoming(StoreValue, L->getLoopLatch());
 
   Cand.Load->replaceAllUsesWith(PHI);
 }
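Note (not part of the patch): with the change above, a loop that stores an i32 and reloads the same slot as a float in the next iteration is rewritten roughly as sketched below. This is a hand-written illustration of the expected shape, assuming the i32-store/float-load pattern from the type-mismatch tests further down; %store_forwarded and %store_forward_cast are the names used by the pass, everything else in the sketch is made up.

  define void @forwarding_sketch(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %N) {
  entry:
    ; The pass preloads the first value of A (as the loaded type, float).
    %load_initial = load float, ptr %A, align 4
    br label %loop

  loop:
    ; PHI created by the pass: initial value on entry, forwarded store value
    ; on the back edge.
    %store_forwarded = phi float [ %load_initial, %entry ], [ %store_forward_cast, %loop ]
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
    %iv.next = add nuw nsw i64 %iv, 1
    %Bidx = getelementptr inbounds i32, ptr %B, i64 %iv
    %Cidx = getelementptr inbounds i32, ptr %C, i64 %iv
    %Aidx = getelementptr inbounds i32, ptr %A, i64 %iv
    %Aidx.next = getelementptr inbounds i32, ptr %A, i64 %iv.next
    %b = load i32, ptr %Bidx, align 4
    %a_p1 = add i32 %b, 2
    ; New: the stored type (i32) differs from the loaded type (float) but has
    ; the same size, so the stored value is bitcast and fed into the PHI
    ; instead of being reloaded from A[i].
    %store_forward_cast = bitcast i32 %a_p1 to float
    store i32 %a_p1, ptr %Aidx.next, align 4
    ; The original load of A[i] is now dead (all uses were replaced by the
    ; PHI) and is left behind for later cleanup.
    %a = load float, ptr %Aidx, align 4
    %c = fmul float %store_forwarded, 2.0
    %c.int = fptosi float %c to i32
    store i32 %c.int, ptr %Cidx, align 4
    %exitcond = icmp eq i64 %iv.next, %N
    br i1 %exitcond, label %exit, label %loop

  exit:
    ret void
  }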
diff --git a/llvm/test/Transforms/EarlyCSE/vector_bitcasting_be.ll b/llvm/test/Transforms/EarlyCSE/vector_bitcasting_be.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/EarlyCSE/vector_bitcasting_be.ll
@@ -0,0 +1,32 @@
+; RUN: opt --opaque-pointers -O3 -S < %s | FileCheck %s
+
+target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @f1(
+; CHECK-NEXT: ret <1 x i1>
+define <1 x i1> @f1() {
+
+  %val1 = alloca <1 x i32>, align 4
+  store i32 3, ptr %val1, align 4
+  %val2 = load <2 x i16>, ptr %val1, align 4
+  %val3 = load <1 x i32>, ptr %val1, align 4
+  %val4 = bitcast <2 x i16> %val2 to <1 x i32>
+  %cmp = icmp eq <1 x i32> %val3, %val4
+
+  ret <1 x i1> %cmp
+}
+
+; CHECK-LABEL: @f2(
+; CHECK-NEXT: ret i1 true
+define i1 @f2() {
+
+  %val1 = alloca <2 x i32>, align 8
+  store <2 x i32> <i32 3, i32 3>, ptr %val1, align 8
+  %val2 = load <4 x i16>, ptr %val1, align 8
+  %val3 = load <2 x i32>, ptr %val1, align 8
+  %val4 = bitcast <4 x i16> %val2 to i64
+  %val5 = bitcast <2 x i32> %val3 to i64
+  %cmp = icmp eq i64 %val4, %val5
+
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/EarlyCSE/vector_bitcasting_le.ll b/llvm/test/Transforms/EarlyCSE/vector_bitcasting_le.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/EarlyCSE/vector_bitcasting_le.ll
@@ -0,0 +1,32 @@
+; RUN: opt --opaque-pointers -O3 -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @f1(
+; CHECK-NEXT: ret <1 x i1>
+define <1 x i1> @f1() {
+
+  %val1 = alloca <1 x i32>, align 4
+  store i32 3, ptr %val1, align 4
+  %val2 = load <2 x i16>, ptr %val1, align 4
+  %val3 = load <1 x i32>, ptr %val1, align 4
+  %val4 = bitcast <2 x i16> %val2 to <1 x i32>
+  %cmp = icmp eq <1 x i32> %val3, %val4
+
+  ret <1 x i1> %cmp
+}
+
+; CHECK-LABEL: @f2(
+; CHECK-NEXT: ret i1 true
+define i1 @f2() {
+
+  %val1 = alloca <2 x i32>, align 8
+  store <2 x i32> <i32 3, i32 3>, ptr %val1, align 8
+  %val2 = load <4 x i16>, ptr %val1, align 8
+  %val3 = load <2 x i32>, ptr %val1, align 8
+  %val4 = bitcast <4 x i16> %val2 to i64
+  %val5 = bitcast <2 x i32> %val3 to i64
+  %cmp = icmp eq i64 %val4, %val5
+
+  ret i1 %cmp
+}
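Note (not part of the patch): the EarlyCSE test above comes in a big-endian and a little-endian copy because the intermediate vector values depend on byte order, while the round-trip bitcast does not. The hand-written sketch below works through the byte layout that both copies rely on; the function and value names are made up.

  ; Datalayout taken from vector_bitcasting_be.ll; vector_bitcasting_le.ll uses
  ; a little-endian layout instead.
  target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"

  define <2 x i16> @byte_layout_sketch() {
    %slot = alloca <1 x i32>, align 4
    ; Storing i32 3 writes the bytes 00 00 00 03 on this big-endian layout
    ; (03 00 00 00 on a little-endian one).
    store i32 3, ptr %slot, align 4
    ; Vector element 0 sits at the lowest address, so this value is
    ; <i16 0, i16 3> here and <i16 3, i16 0> on a little-endian target.
    ; Bitcasting it back to <1 x i32> reinterprets the same four bytes, so the
    ; icmp in @f1 above compares equal values on either layout.
    %as.v2i16 = load <2 x i16>, ptr %slot, align 4
    ret <2 x i16> %as.v2i16
  }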
diff --git a/llvm/test/Transforms/LoopLoadElim/type-mismatch-opaque-ptr.ll b/llvm/test/Transforms/LoopLoadElim/type-mismatch-opaque-ptr.ll
--- a/llvm/test/Transforms/LoopLoadElim/type-mismatch-opaque-ptr.ll
+++ b/llvm/test/Transforms/LoopLoadElim/type-mismatch-opaque-ptr.ll
@@ -1,6 +1,7 @@
 ; RUN: opt --opaque-pointers -loop-load-elim -S < %s | FileCheck %s
 
-; Don't crash if the store and the load use different types.
+; If the store and the load use different types but have the same
+; size, then we should still be able to forward the value.
 ;
 ; for (unsigned i = 0; i < 100; i++) {
 ;   A[i+1] = B[i] + 2;
@@ -30,7 +31,7 @@
 
 ; CHECK: %a = load float, ptr %Aidx, align 4
   %a = load float, ptr %Aidx, align 4
-; CHECK-NEXT: %c = fmul float %a, 2.0
+; CHECK-NEXT: %c = fmul float %store_forwarded, 2.0
   %c = fmul float %a, 2.0
   %c.int = fptosi float %c to i32
   store i32 %c.int, ptr %Cidx, align 4
@@ -42,7 +43,8 @@
   ret void
 }
 
-; Don't crash if the store and the load use different types.
+; If the store and the load use different types but have the same
+; size, then we should still be able to forward the value.
 ;
 ; for (unsigned i = 0; i < 100; i++) {
 ;   A[i+1] = B[i] + 2;
@@ -74,7 +76,7 @@
 
 ; CHECK: %a = load float, ptr %Aidx, align 4
   %a = load float, ptr %Aidx, align 4
-; CHECK-NEXT: %c = fmul float %a, 2.0
+; CHECK-NEXT: %c = fmul float %store_forwarded, 2.0
   %c = fmul float %a, 2.0
   %c.int = fptosi float %c to i32
   store i32 %c.int, ptr %Cidx, align 4
diff --git a/llvm/test/Transforms/LoopLoadElim/type-mismatch.ll b/llvm/test/Transforms/LoopLoadElim/type-mismatch.ll
--- a/llvm/test/Transforms/LoopLoadElim/type-mismatch.ll
+++ b/llvm/test/Transforms/LoopLoadElim/type-mismatch.ll
@@ -1,6 +1,7 @@
 ; RUN: opt -loop-load-elim -S < %s | FileCheck %s
 
-; Don't crash if the store and the load use different types.
+; If the store and the load use different types but have the same
+; size, then we should still be able to forward the value.
 ;
 ; for (unsigned i = 0; i < 100; i++) {
 ;   A[i+1] = B[i] + 2;
@@ -27,11 +28,12 @@
 
   %b = load i32, i32* %Bidx, align 4
   %a_p1 = add i32 %b, 2
+; CHECK: %store_forward_cast = bitcast i32 %a_p1 to float
   store i32 %a_p1, i32* %Aidx_next, align 4
 
 ; CHECK: %a = load float, float* %Aidx.float, align 4
   %a = load float, float* %Aidx.float, align 4
-; CHECK-NEXT: %c = fmul float %a, 2.0
+; CHECK-NEXT: %c = fmul float %store_forwarded, 2.0
   %c = fmul float %a, 2.0
   %c.int = fptosi float %c to i32
   store i32 %c.int, i32* %Cidx, align 4
@@ -43,7 +45,8 @@
   ret void
 }
 
-; Don't crash if the store and the load use different types.
+; If the store and the load use different types but have the same
+; size, then we should still be able to forward the value.
 ;
 ; for (unsigned i = 0; i < 100; i++) {
 ;   A[i+1] = B[i] + 2;
@@ -69,6 +72,7 @@
 
   %b = load i32, i32* %Bidx, align 4
   %a_p2 = add i32 %b, 2
+; CHECK: %store_forward_cast = bitcast i32 %a_p3 to float
   store i32 %a_p2, i32* %Aidx_next, align 4
 
   %a_p3 = add i32 %b, 3
@@ -76,7 +80,7 @@
 
 ; CHECK: %a = load float, float* %Aidx.float, align 4
   %a = load float, float* %Aidx.float, align 4
-; CHECK-NEXT: %c = fmul float %a, 2.0
+; CHECK-NEXT: %c = fmul float %store_forwarded, 2.0
   %c = fmul float %a, 2.0
   %c.int = fptosi float %c to i32
   store i32 %c.int, i32* %Cidx, align 4
@@ -87,3 +91,82 @@
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+; Check that we don't forward between pointer-sized integers and actual
+; pointers. We could potentially do this in future, but in IR pointers are
+; considered to have a size of 0.
+; CHECK-LABEL: @f3(
+; CHECK-NOT: store_forwarded
+define void @f3(i64* noalias %A, i64* noalias %B, i64* noalias %C, i64 %N) {
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+
+  %Aidx_next = getelementptr inbounds i64, i64* %A, i64 %indvars.iv.next
+  %Bidx = getelementptr inbounds i64, i64* %B, i64 %indvars.iv
+  %Cidx = getelementptr inbounds i64, i64* %C, i64 %indvars.iv
+  %Aidx = getelementptr inbounds i64, i64* %A, i64 %indvars.iv
+  %Aidx.i8p = bitcast i64* %Aidx to i8**
+
+  %b = load i64, i64* %Bidx, align 8
+  %a_p1 = add i64 %b, 2
+  store i64 %a_p1, i64* %Aidx_next, align 8
+
+  %a = load i8*, i8** %Aidx.i8p, align 8
+  %c = getelementptr i8, i8* %a, i64 57
+  %c.i64p = ptrtoint i8* %c to i64
+  store i64 %c.i64p, i64* %Cidx, align 8
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; If the store and the load use different types but have the same
+; size, then we should still be able to forward the value -- also for
+; vector types.
+;
+; for (unsigned i = 0; i < 100; i++) {
+;   A[i+1] = B[i] + 2;
+;   C[i] = ((float*)A)[i] * 2;
+; }
+; CHECK-LABEL: @f4(
+define void @f4(i32* noalias %A, i32* noalias %B, i32* noalias %C, i64 %N) {
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+
+  %Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+  %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %Aidx.float = bitcast i32* %Aidx to <2 x half>*
+
+  %b = load i32, i32* %Bidx, align 4
+  %a_p1 = add i32 %b, 2
+; CHECK: %store_forward_cast = bitcast i32 %a_p1 to <2 x half>
+  store i32 %a_p1, i32* %Aidx_next, align 4
+
+; CHECK: %a = load <2 x half>, <2 x half>* %Aidx.float, align 4
+  %a = load <2 x half>, <2 x half>* %Aidx.float, align 4
+; CHECK-NEXT: %c = fmul <2 x half> %store_forwarded,
+  %c = fmul <2 x half> %a, <half 2.0, half 2.0>
+  %c.int = bitcast <2 x half> %c to i32
+  store i32 %c.int, i32* %Cidx, align 4
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
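Note (not part of the patch): the reason @f3 above is not forwarded is twofold: getPrimitiveSizeInBits() returns 0 for pointer types, so the size check added in the candidate collection already rejects the pair, and CastInst::castIsValid refuses a bitcast between an integer and a pointer anyway. If this were supported in the future, as the comment on @f3 hints, the back-edge value would need an inttoptr rather than a bitcast, roughly as in this hand-written sketch:

  define i8* @int_to_pointer_sketch(i64 %stored) {
    ; %cast = bitcast i64 %stored to i8*   ; invalid IR: bitcast cannot convert
    ;                                      ; between integers and pointers
    %cast = inttoptr i64 %stored to i8*    ; what such an extension would use
    ret i8* %cast
  }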