Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -939,6 +939,16 @@
   return false;
 }
 
+/// Returns true if \p V is a call whose called function is llvm.bswap.
+static bool isBswapIntrinsicCall(const Value *V) {
+  if (const Instruction *I = dyn_cast<Instruction>(V))
+    if (auto *CI = dyn_cast<CallInst>(I))
+      if (auto *F = CI->getCalledFunction())
+        if (F->getIntrinsicID() == Intrinsic::bswap)
+          return true;
+  return false;
+}
+
 int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment, unsigned AddressSpace,
                                     const Instruction *I) {
@@ -975,6 +985,24 @@
   unsigned NumOps =
     (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
 
+  // Store/Load reversed saves one instruction: when a non-vector load feeds
+  // only a bswap, or a store writes a single-use bswap result, the memory
+  // operation is given a cost of 0 and only the bswap itself is counted.
+  if (!Src->isVectorTy() && NumOps == 1 && I != nullptr) {
+    if (Opcode == Instruction::Load && I->hasOneUse()) {
+      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
+      // In case of load -> bswap -> store, return normal cost for the load.
+      if (isBswapIntrinsicCall(LdUser) &&
+          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
+        return 0;
+    }
+    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      const Value *StoredVal = SI->getValueOperand();
+      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
+        return 0;
+    }
+  }
+
   if (Src->getScalarSizeInBits() == 128)
     // 128 bit scalars are held in a pair of two 64 bit registers.
     NumOps *= 2;
Index: test/Analysis/CostModel/SystemZ/intrinsics.ll
===================================================================
--- test/Analysis/CostModel/SystemZ/intrinsics.ll
+++ test/Analysis/CostModel/SystemZ/intrinsics.ll
@@ -40,6 +40,73 @@
   ret void
 }
 
+; Test that store/load reversed is reflected in costs.
+define void @bswap_i64_mem(i64* %src, i64 %arg, i64* %dst) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i64_mem':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld1 = load i64, i64* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i64 @llvm.bswap.i64(i64 %Ld1)
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call i64 @llvm.bswap.i64(i64 %arg)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i64 %swp2, i64* %dst
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld2 = load i64, i64* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp3 = tail call i64 @llvm.bswap.i64(i64 %Ld2)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i64 %swp3, i64* %dst
+  %Ld1 = load i64, i64* %src ; sole user is a bswap that is never stored: cost 0 expected
+  %swp1 = tail call i64 @llvm.bswap.i64(i64 %Ld1)
+
+  %swp2 = tail call i64 @llvm.bswap.i64(i64 %arg)
+  store i64 %swp2, i64* %dst ; stores a single-use bswap of %arg: cost 0 expected
+
+  %Ld2 = load i64, i64* %src ; load -> bswap -> store chain: the load keeps cost 1
+  %swp3 = tail call i64 @llvm.bswap.i64(i64 %Ld2)
+  store i64 %swp3, i64* %dst ; in the chain it is the store that is expected to cost 0
+
+  ret void
+}
+
+define void @bswap_i32_mem(i32* %src, i32 %arg, i32* %dst) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i32_mem':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld1 = load i32, i32* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i32 @llvm.bswap.i32(i32 %Ld1)
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call i32 @llvm.bswap.i32(i32 %arg)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i32 %swp2, i32* %dst
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld2 = load i32, i32* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp3 = tail call i32 @llvm.bswap.i32(i32 %Ld2)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i32 %swp3, i32* %dst
+  %Ld1 = load i32, i32* %src ; sole user is a bswap that is never stored: cost 0 expected
+  %swp1 = tail call i32 @llvm.bswap.i32(i32 %Ld1)
+
+  %swp2 = tail call i32 @llvm.bswap.i32(i32 %arg)
+  store i32 %swp2, i32* %dst ; stores a single-use bswap of %arg: cost 0 expected
+
+  %Ld2 = load i32, i32* %src ; load -> bswap -> store chain: the load keeps cost 1
+  %swp3 = tail call i32 @llvm.bswap.i32(i32 %Ld2)
+  store i32 %swp3, i32* %dst ; in the chain it is the store that is expected to cost 0
+
+  ret void
+}
+
+define void @bswap_i16_mem(i16* %src, i16 %arg, i16* %dst) {
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'bswap_i16_mem':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %Ld1 = load i16, i16* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i16 @llvm.bswap.i16(i16 %Ld1)
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp2 = tail call i16 @llvm.bswap.i16(i16 %arg)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i16 %swp2, i16* %dst
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %Ld2 = load i16, i16* %src
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp3 = tail call i16 @llvm.bswap.i16(i16 %Ld2)
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: store i16 %swp3, i16* %dst
+  %Ld1 = load i16, i16* %src ; sole user is a bswap that is never stored: cost 0 expected
+  %swp1 = tail call i16 @llvm.bswap.i16(i16 %Ld1)
+
+  %swp2 = tail call i16 @llvm.bswap.i16(i16 %arg)
+  store i16 %swp2, i16* %dst ; stores a single-use bswap of %arg: cost 0 expected
+
+  %Ld2 = load i16, i16* %src ; load -> bswap -> store chain: the load keeps cost 1
+  %swp3 = tail call i16 @llvm.bswap.i16(i16 %Ld2)
+  store i16 %swp3, i16* %dst ; in the chain it is the store that is expected to cost 0
+
+  ret void
+}
+
 declare i64 @llvm.bswap.i64(i64)
 declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)