Index: llvm/trunk/lib/Target/NVPTX/LLVMBuild.txt
===================================================================
--- llvm/trunk/lib/Target/NVPTX/LLVMBuild.txt
+++ llvm/trunk/lib/Target/NVPTX/LLVMBuild.txt
@@ -28,5 +28,5 @@
 type = Library
 name = NVPTXCodeGen
 parent = NVPTX
-required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo Scalar SelectionDAG Support Target TransformUtils
+required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo Scalar SelectionDAG Support Target TransformUtils Vectorize
 add_to_library_groups = NVPTX
Index: llvm/trunk/lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ llvm/trunk/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -46,6 +46,7 @@
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Vectorize.h"

 using namespace llvm;

@@ -54,6 +55,13 @@
     cl::desc("Optimize address spaces using NVPTXInferAddressSpaces instead of "
              "NVPTXFavorNonGenericAddrSpaces"));

+// LSV is still relatively new; this switch lets us turn it off in case we
+// encounter (or suspect) a bug.
+static cl::opt<bool>
+    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
+                               cl::desc("Disable load/store vectorizer"),
+                               cl::init(false), cl::Hidden);
+
 namespace llvm {
 void initializeNVVMIntrRangePass(PassRegistry&);
 void initializeNVVMReflectPass(PassRegistry&);
@@ -258,6 +266,8 @@
   addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
   if (getOptLevel() != CodeGenOpt::None) {
     addAddressSpaceInferencePasses();
+    if (!DisableLoadStoreVectorizer)
+      addPass(createLoadStoreVectorizerPass());
     addStraightLineScalarOptimizationPasses();
   }
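
For context, the pass wired in above is also exposed on its own through opt as
-load-store-vectorizer, which makes the IR-level transformation easy to inspect
directly. The sketch below is illustrative only, not part of this change; it
applies the pass to the same function the new test uses. The exact value names
may differ, but the two adjacent loads are expected to be merged into a single
vector load roughly as shown in the trailing comments:

    ; Input: two adjacent i32 loads; the first carries align 8, so the pair
    ; is a candidate for one 8-byte vector load.
    define i32 @f(i32* %p) {
      %p.1 = getelementptr i32, i32* %p, i32 1
      %v0 = load i32, i32* %p, align 8
      %v1 = load i32, i32* %p.1, align 4
      %sum = add i32 %v0, %v1
      ret i32 %sum
    }

    ; After opt -load-store-vectorizer, roughly:
    ;   %1 = bitcast i32* %p to <2 x i32>*
    ;   %2 = load <2 x i32>, <2 x i32>* %1, align 8
    ;   %v0 = extractelement <2 x i32> %2, i32 0
    ;   %v1 = extractelement <2 x i32> %2, i32 1
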
Index: llvm/trunk/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
===================================================================
--- llvm/trunk/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ llvm/trunk/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s | FileCheck -check-prefix=ENABLED %s
+; RUN: llc -disable-nvptx-load-store-vectorizer < %s | FileCheck -check-prefix=DISABLED %s
+target triple = "nvptx64-nvidia-cuda"
+
+; Check that the load-store vectorizer is enabled by default for nvptx, and
+; that it's disabled by the appropriate flag.
+
+; ENABLED: ld.v2.{{.}}32
+; DISABLED: ld.{{.}}32
+; DISABLED: ld.{{.}}32
+define i32 @f(i32* %p) {
+  %p.1 = getelementptr i32, i32* %p, i32 1
+  %v0 = load i32, i32* %p, align 8
+  %v1 = load i32, i32* %p.1, align 4
+  %sum = add i32 %v0, %v1
+  ret i32 %sum
+}
Index: llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
===================================================================
--- llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
+++ llvm/trunk/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
@@ -45,10 +45,10 @@
   ret void
 }
 ; PTX-LABEL: sum_of_array(
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}

 ; IR-LABEL: @sum_of_array(
 ; TODO: GVN is unable to preserve the "inbounds" keyword on the first GEP. Need
@@ -90,10 +90,10 @@
   ret void
 }
 ; PTX-LABEL: sum_of_array2(
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}

 ; IR-LABEL: @sum_of_array2(
 ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
@@ -140,10 +140,10 @@
   ret void
 }
 ; PTX-LABEL: sum_of_array3(
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}

 ; IR-LABEL: @sum_of_array3(
 ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
@@ -186,10 +186,10 @@
   ret void
 }
 ; PTX-LABEL: sum_of_array4(
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
+; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}

 ; IR-LABEL: @sum_of_array4(
 ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
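
To reproduce the codegen difference the new test covers by hand, invocations
along these lines should work (assuming an llc built from this revision; llc
defaults to an opt level above None, so the vectorizer runs unless disabled):

    llc < test/CodeGen/NVPTX/LoadStoreVectorizer.ll
    # expected: one vector load, e.g. ld.v2.u32, for the two adjacent loads

    llc -disable-nvptx-load-store-vectorizer < test/CodeGen/NVPTX/LoadStoreVectorizer.ll
    # expected: two scalar loads, e.g. ld.u32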