Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
@@ -309,6 +309,9 @@
                            MachineFunction &MF,
                            unsigned Intrinsic) const override;
 
+  bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+                             EVT NewVT) const override;
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
   bool isTruncateFree(EVT VT1, EVT VT2) const override;
 
Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7657,6 +7657,33 @@
   return false;
 }
 
+bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+                                                  ISD::LoadExtType ExtTy,
+                                                  EVT NewVT) const {
+  // If we're reducing the load width in order to avoid having to use an extra
+  // instruction to do extension then it's probably a good idea.
+  if (ExtTy != ISD::NON_EXTLOAD)
+    return true;
+  // Don't reduce load width if it would prevent us from combining a shift into
+  // the offset.
+  MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
+  assert(Mem);
+  const SDValue &Base = Mem->getBasePtr();
+  if (Base.getOpcode() == ISD::ADD &&
+      Base.getOperand(1).getOpcode() == ISD::SHL &&
+      Base.getOperand(1).hasOneUse() &&
+      Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
+    // The shift can be combined if it matches the size of the value being
+    // loaded (and so reducing the width would make it not match).
+    uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
+    uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
+    if (ShiftAmount == Log2_32(LoadBytes))
+      return false;
+  }
+  // We have no reason to disallow reducing the load width, so allow it.
+  return true;
+}
+
 // Truncations from 64-bit GPR to 32-bit GPR is free.
 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
Index: llvm/trunk/test/CodeGen/AArch64/arm64-fold-lsl.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-fold-lsl.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-fold-lsl.ll
@@ -77,3 +77,309 @@
   store i64 %val, i64* %arrayidx86, align 8
   ret void
 }
+
+; Check that we combine a shift into the offset instead of using a narrower load
+; when we have a load followed by a trunc
+
+define i32 @load_doubleword_trunc_word(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_word:
+; CHECK: ldr x0, [x0, x1, lsl #3]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i32
+  ret i32 %trunc
+}
+
+define i16 @load_doubleword_trunc_halfword(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_halfword:
+; CHECK: ldr x0, [x0, x1, lsl #3]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i16
+  ret i16 %trunc
+}
+
+define i8 @load_doubleword_trunc_byte(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_byte:
+; CHECK: ldr x0, [x0, x1, lsl #3]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i8
+  ret i8 %trunc
+}
+
+define i16 @load_word_trunc_halfword(i32* %ptr, i64 %off) {
+entry:
+; CHECK-LABEL: load_word_trunc_halfword:
+; CHECK: ldr w0, [x0, x1, lsl #2]
+  %idx = getelementptr inbounds i32, i32* %ptr, i64 %off
+  %x = load i32, i32* %idx, align 8
+  %trunc = trunc i32 %x to i16
+  ret i16 %trunc
+}
+
+define i8 @load_word_trunc_byte(i32* %ptr, i64 %off) {
+; CHECK-LABEL: load_word_trunc_byte:
+; CHECK: ldr w0, [x0, x1, lsl #2]
+entry:
+  %idx = getelementptr inbounds i32, i32* %ptr, i64 %off
+  %x = load i32, i32* %idx, align 8
+  %trunc = trunc i32 %x to i8
+  ret i8 %trunc
+}
+
+define i8 @load_halfword_trunc_byte(i16* %ptr, i64 %off) {
+; CHECK-LABEL: load_halfword_trunc_byte:
+; CHECK: ldrh w0, [x0, x1, lsl #1]
+entry:
+  %idx = getelementptr inbounds i16, i16* %ptr, i64 %off
+  %x = load i16, i16* %idx, align 8
+  %trunc = trunc i16 %x to i8
+  ret i8 %trunc
+}
+
+; Check that we do use a narrower load, and so don't combine the shift, when
+; the loaded value is zero-extended.
+
+define i64 @load_doubleword_trunc_word_zext(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_word_zext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #3
+; CHECK: ldr w0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i32
+  %ext = zext i32 %trunc to i64
+  ret i64 %ext
+}
+
+define i64 @load_doubleword_trunc_halfword_zext(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_halfword_zext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #3
+; CHECK: ldrh w0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i16
+  %ext = zext i16 %trunc to i64
+  ret i64 %ext
+}
+
+define i64 @load_doubleword_trunc_byte_zext(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_byte_zext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #3
+; CHECK: ldrb w0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i8
+  %ext = zext i8 %trunc to i64
+  ret i64 %ext
+}
+
+define i64 @load_word_trunc_halfword_zext(i32* %ptr, i64 %off) {
+; CHECK-LABEL: load_word_trunc_halfword_zext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #2
+; CHECK: ldrh w0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i32, i32* %ptr, i64 %off
+  %x = load i32, i32* %idx, align 8
+  %trunc = trunc i32 %x to i16
+  %ext = zext i16 %trunc to i64
+  ret i64 %ext
+}
+
+define i64 @load_word_trunc_byte_zext(i32* %ptr, i64 %off) {
+; CHECK-LABEL: load_word_trunc_byte_zext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #2
+; CHECK: ldrb w0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i32, i32* %ptr, i64 %off
+  %x = load i32, i32* %idx, align 8
+  %trunc = trunc i32 %x to i8
+  %ext = zext i8 %trunc to i64
+  ret i64 %ext
+}
+
+define i64 @load_halfword_trunc_byte_zext(i16* %ptr, i64 %off) {
+; CHECK-LABEL: load_halfword_trunc_byte_zext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #1
+; CHECK: ldrb w0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i16, i16* %ptr, i64 %off
+  %x = load i16, i16* %idx, align 8
+  %trunc = trunc i16 %x to i8
+  %ext = zext i8 %trunc to i64
+  ret i64 %ext
+}
+
+; Check that we do use a narrower load, and so don't combine the shift, when
+; the loaded value is sign-extended.
+
+define i64 @load_doubleword_trunc_word_sext(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_word_sext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #3
+; CHECK: ldrsw x0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i32
+  %ext = sext i32 %trunc to i64
+  ret i64 %ext
+}
+
+define i64 @load_doubleword_trunc_halfword_sext(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_halfword_sext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #3
+; CHECK: ldrsh x0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i16
+  %ext = sext i16 %trunc to i64
+  ret i64 %ext
+}
+
+define i64 @load_doubleword_trunc_byte_sext(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_byte_sext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #3
+; CHECK: ldrsb x0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i8
+  %ext = sext i8 %trunc to i64
+  ret i64 %ext
+}
+
+define i64 @load_word_trunc_halfword_sext(i32* %ptr, i64 %off) {
+; CHECK-LABEL: load_word_trunc_halfword_sext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #2
+; CHECK: ldrsh x0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i32, i32* %ptr, i64 %off
+  %x = load i32, i32* %idx, align 8
+  %trunc = trunc i32 %x to i16
+  %ext = sext i16 %trunc to i64
+  ret i64 %ext
+}
+
+define i64 @load_word_trunc_byte_sext(i32* %ptr, i64 %off) {
+; CHECK-LABEL: load_word_trunc_byte_sext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #2
+; CHECK: ldrsb x0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i32, i32* %ptr, i64 %off
+  %x = load i32, i32* %idx, align 8
+  %trunc = trunc i32 %x to i8
+  %ext = sext i8 %trunc to i64
+  ret i64 %ext
+}
+
+define i64 @load_halfword_trunc_byte_sext(i16* %ptr, i64 %off) {
+; CHECK-LABEL: load_halfword_trunc_byte_sext:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #1
+; CHECK: ldrsb x0, [x0, [[REG]]]
+entry:
+  %idx = getelementptr inbounds i16, i16* %ptr, i64 %off
+  %x = load i16, i16* %idx, align 8
+  %trunc = trunc i16 %x to i8
+  %ext = sext i8 %trunc to i64
+  ret i64 %ext
+}
+
+; Check that we don't combine the shift, and so will use a narrower load, when
+; the shift is used more than once.
+
+define i32 @load_doubleword_trunc_word_reuse_shift(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_word_reuse_shift:
+; CHECK: lsl x[[REG1:[0-9]+]], x1, #3
+; CHECK: ldr w[[REG2:[0-9]+]], [x0, x[[REG1]]]
+; CHECK: add w0, w[[REG2]], w[[REG1]]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i32
+  %lsl = shl i64 %off, 3
+  %lsl.trunc = trunc i64 %lsl to i32
+  %add = add i32 %trunc, %lsl.trunc
+  ret i32 %add
+}
+
+define i16 @load_doubleword_trunc_halfword_reuse_shift(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_halfword_reuse_shift:
+; CHECK: lsl x[[REG1:[0-9]+]], x1, #3
+; CHECK: ldrh w[[REG2:[0-9]+]], [x0, x[[REG1]]]
+; CHECK: add w0, w[[REG2]], w[[REG1]]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i16
+  %lsl = shl i64 %off, 3
+  %lsl.trunc = trunc i64 %lsl to i16
+  %add = add i16 %trunc, %lsl.trunc
+  ret i16 %add
+}
+
+define i8 @load_doubleword_trunc_byte_reuse_shift(i64* %ptr, i64 %off) {
+; CHECK-LABEL: load_doubleword_trunc_byte_reuse_shift:
+; CHECK: lsl x[[REG1:[0-9]+]], x1, #3
+; CHECK: ldrb w[[REG2:[0-9]+]], [x0, x[[REG1]]]
+; CHECK: add w0, w[[REG2]], w[[REG1]]
+entry:
+  %idx = getelementptr inbounds i64, i64* %ptr, i64 %off
+  %x = load i64, i64* %idx, align 8
+  %trunc = trunc i64 %x to i8
+  %lsl = shl i64 %off, 3
+  %lsl.trunc = trunc i64 %lsl to i8
+  %add = add i8 %trunc, %lsl.trunc
+  ret i8 %add
+}
+
+define i16 @load_word_trunc_halfword_reuse_shift(i32* %ptr, i64 %off) {
+entry:
+; CHECK-LABEL: load_word_trunc_halfword_reuse_shift:
+; CHECK: lsl x[[REG1:[0-9]+]], x1, #2
+; CHECK: ldrh w[[REG2:[0-9]+]], [x0, x[[REG1]]]
+; CHECK: add w0, w[[REG2]], w[[REG1]]
+  %idx = getelementptr inbounds i32, i32* %ptr, i64 %off
+  %x = load i32, i32* %idx, align 8
+  %trunc = trunc i32 %x to i16
+  %lsl = shl i64 %off, 2
+  %lsl.trunc = trunc i64 %lsl to i16
+  %add = add i16 %trunc, %lsl.trunc
+  ret i16 %add
+}
+
+define i8 @load_word_trunc_byte_reuse_shift(i32* %ptr, i64 %off) {
+; CHECK-LABEL: load_word_trunc_byte_reuse_shift:
+; CHECK: lsl x[[REG1:[0-9]+]], x1, #2
+; CHECK: ldrb w[[REG2:[0-9]+]], [x0, x[[REG1]]]
+; CHECK: add w0, w[[REG2]], w[[REG1]]
+entry:
+  %idx = getelementptr inbounds i32, i32* %ptr, i64 %off
+  %x = load i32, i32* %idx, align 8
+  %trunc = trunc i32 %x to i8
+  %lsl = shl i64 %off, 2
+  %lsl.trunc = trunc i64 %lsl to i8
+  %add = add i8 %trunc, %lsl.trunc
+  ret i8 %add
+}
+
+define i8 @load_halfword_trunc_byte_reuse_shift(i16* %ptr, i64 %off) {
+; CHECK-LABEL: load_halfword_trunc_byte_reuse_shift:
+; CHECK: lsl x[[REG1:[0-9]+]], x1, #1
+; CHECK: ldrb w[[REG2:[0-9]+]], [x0, x[[REG1]]]
+; CHECK: add w0, w[[REG2]], w[[REG1]]
+entry:
+  %idx = getelementptr inbounds i16, i16* %ptr, i64 %off
+  %x = load i16, i16* %idx, align 8
+  %trunc = trunc i16 %x to i8
+  %lsl = shl i64 %off, 1
+  %lsl.trunc = trunc i64 %lsl to i8
+  %add = add i8 %trunc, %lsl.trunc
+  ret i8 %add
+}
Index: llvm/trunk/test/CodeGen/AArch64/arm64-register-offset-addressing.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-register-offset-addressing.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-register-offset-addressing.ll
@@ -2,8 +2,7 @@
 
 define i8 @test_64bit_add(i16* %a, i64 %b) {
 ; CHECK-LABEL: test_64bit_add:
-; CHECK: lsl [[REG:x[0-9]+]], x1, #1
-; CHECK: ldrb w0, [x0, [[REG]]]
+; CHECK: ldrh w0, [x0, x1, lsl #1]
 ; CHECK: ret
   %tmp1 = getelementptr inbounds i16, i16* %a, i64 %b
   %tmp2 = load i16, i16* %tmp1