diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -316,16 +316,29 @@ // can be halved so that each half fits into a register. That's the case if // the element type fits into a register and the number of elements is a // power of 2 > 1. - if (auto *DataTypeVTy = dyn_cast<VectorType>(DataType)) { - unsigned NumElements = - cast<FixedVectorType>(DataTypeVTy)->getNumElements(); - unsigned EltSize = DataTypeVTy->getElementType()->getScalarSizeInBits(); + if (auto *DataTypeTy = dyn_cast<FixedVectorType>(DataType)) { + unsigned NumElements = DataTypeTy->getNumElements(); + unsigned EltSize = DataTypeTy->getElementType()->getScalarSizeInBits(); return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 && EltSize <= 128 && isPowerOf2_64(EltSize); } return BaseT::isLegalNTStore(DataType, Alignment); } + bool isLegalNTLoad(Type *DataType, Align Alignment) { + // Only supports little-endian targets.
+ if (ST->isLittleEndian()) { + if (auto *DataTypeTy = dyn_cast<FixedVectorType>(DataType)) { + unsigned NumElements = DataTypeTy->getNumElements(); + unsigned EltSize = DataTypeTy->getElementType()->getScalarSizeInBits(); + return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 && + EltSize <= 128 && isPowerOf2_64(EltSize); + } + } + + return BaseT::isLegalNTLoad(DataType, Alignment); + } + bool enableOrderedReductions() const { return true; } InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll @@ -258,8 +258,7 @@ define i4 @test_i4_load(i4* %ddst) { ; CHECK-LABEL: define i4 @test_i4_load -; CHECK-LABEL: vector.body: -; CHECK: [[LOAD:%.*]] = load i4, i4* {{.*}}, align 1, !nontemporal !0 +; CHECK-NOT: vector.body: ; CHECk: ret i4 %{{.*}} ; entry: @@ -281,7 +280,7 @@ define i8 @test_load_i8(i8* %ddst) { ; CHECK-LABEL: @test_load_i8( -; CHECK-NOT: vector.body: +; CHECK: vector.body: ; CHECk: ret i8 %{{.*}} ; entry: @@ -303,7 +302,7 @@ define half @test_half_load(half* %ddst) { ; CHECK-LABEL: @test_half_load -; CHECK-NOT: vector.body: +; CHECK: vector.body: ; CHECk: ret half %{{.*}} ; entry: @@ -325,7 +324,7 @@ define i16 @test_i16_load(i16* %ddst) { ; CHECK-LABEL: @test_i16_load -; CHECK-NOT: vector.body: +; CHECK: vector.body: ; CHECk: ret i16 %{{.*}} ; entry: @@ -347,7 +346,7 @@ define i32 @test_i32_load(i32* %ddst) { ; CHECK-LABEL: @test_i32_load -; CHECK-NOT: vector.body: +; CHECK: vector.body: ; CHECk: ret i32 %{{.*}} ; entry: @@ -413,7 +412,7 @@ define i64 @test_i64_load(i64* %ddst) { ; CHECK-LABEL: @test_i64_load -; CHECK-NOT: vector.body: +; CHECK: vector.body: ; CHECk: ret i64 %{{.*}} ; entry: @@ -435,7 +434,7 @@ define double
@test_double_load(double* %ddst) { ; CHECK-LABEL: @test_double_load -; CHECK-NOT: vector.body: +; CHECK: vector.body: ; CHECk: ret double %{{.*}} ; entry: @@ -457,7 +456,7 @@ define i128 @test_i128_load(i128* %ddst) { ; CHECK-LABEL: @test_i128_load -; CHECK-NOT: vector.body: +; CHECK: vector.body: ; CHECk: ret i128 %{{.*}} ; entry: