Index: llvm/trunk/lib/Target/NVPTX/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/NVPTX/CMakeLists.txt +++ llvm/trunk/lib/Target/NVPTX/CMakeLists.txt @@ -12,7 +12,6 @@ NVPTXAllocaHoisting.cpp NVPTXAsmPrinter.cpp NVPTXAssignValidGlobalNames.cpp - NVPTXFavorNonGenericAddrSpaces.cpp NVPTXFrameLowering.cpp NVPTXGenericToNVVM.cpp NVPTXISelDAGToDAG.cpp Index: llvm/trunk/lib/Target/NVPTX/NVPTX.h =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTX.h +++ llvm/trunk/lib/Target/NVPTX/NVPTX.h @@ -45,7 +45,6 @@ llvm::CodeGenOpt::Level OptLevel); ModulePass *createNVPTXAssignValidGlobalNamesPass(); ModulePass *createGenericToNVVMPass(); -FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass(); FunctionPass *createNVPTXInferAddressSpacesPass(); FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion); FunctionPass *createNVVMReflectPass(); Index: llvm/trunk/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp +++ llvm/trunk/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp @@ -1,289 +0,0 @@ -//===-- NVPTXFavorNonGenericAddrSpace.cpp - ---------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// FIXME: This pass is deprecated in favor of NVPTXInferAddressSpaces, which -// uses a new algorithm that handles pointer induction variables. -// -// When a load/store accesses the generic address space, checks whether the -// address is casted from a non-generic address space. If so, remove this -// addrspacecast because accessing non-generic address spaces is typically -// faster. Besides removing addrspacecasts directly used by loads/stores, this -// optimization also recursively traces into a GEP's pointer operand and a -// bitcast's source to find more eliminable addrspacecasts. -// -// For instance, the code below loads a float from an array allocated in -// addrspace(3). -// -// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]* -// %1 = gep [10 x float]* %0, i64 0, i64 %i -// %2 = bitcast float* %1 to i32* -// %3 = load i32* %2 ; emits ld.u32 -// -// First, function hoistAddrSpaceCastFrom reorders the addrspacecast, the GEP, -// and the bitcast to expose more optimization opportunities to function -// optimizeMemoryInst. The intermediate code looks like: -// -// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i -// %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)* -// %2 = addrspacecast i32 addrspace(3)* %1 to i32* -// %3 = load i32* %2 ; still emits ld.u32, but will be optimized shortly -// -// Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed -// generic pointers, and folds the load and the addrspacecast into a load from -// the original address space. The final code looks like: -// -// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i -// %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)* -// %3 = load i32 addrspace(3)* %1 ; emits ld.shared.f32 -// -// This pass may remove an addrspacecast in a different BB. Therefore, we -// implement it as a FunctionPass. -// -// TODO: -// The current implementation doesn't handle PHINodes. Eliminating -// addrspacecasts used by PHINodes is trickier because PHINodes can introduce -// loops in data flow. For example, -// -// %generic.input = addrspacecast float addrspace(3)* %input to float* -// loop: -// %y = phi [ %generic.input, %y2 ] -// %y2 = getelementptr %y, 1 -// %v = load %y2 -// br ..., label %loop, ... -// -// Marking %y2 shared depends on marking %y shared, but %y also data-flow -// depends on %y2. We probably need an iterative fix-point algorithm on handle -// this case. -// -//===----------------------------------------------------------------------===// - -#include "NVPTX.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Operator.h" -#include "llvm/Support/CommandLine.h" - -using namespace llvm; - -// An option to disable this optimization. Enable it by default. -static cl::opt DisableFavorNonGeneric( - "disable-nvptx-favor-non-generic", - cl::init(false), - cl::desc("Do not convert generic address space usage " - "to non-generic address space usage"), - cl::Hidden); - -namespace { -/// \brief NVPTXFavorNonGenericAddrSpaces -class NVPTXFavorNonGenericAddrSpaces : public FunctionPass { -public: - static char ID; - NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {} - bool runOnFunction(Function &F) override; - -private: - /// Optimizes load/store instructions. Idx is the index of the pointer operand - /// (0 for load, and 1 for store). Returns true if it changes anything. - bool optimizeMemoryInstruction(Instruction *I, unsigned Idx); - /// Recursively traces into a GEP's pointer operand or a bitcast's source to - /// find an eliminable addrspacecast, and hoists that addrspacecast to the - /// outermost level. For example, this function transforms - /// bitcast(gep(gep(addrspacecast(X)))) - /// to - /// addrspacecast(bitcast(gep(gep(X)))). - /// - /// This reordering exposes to optimizeMemoryInstruction more - /// optimization opportunities on loads and stores. - /// - /// If this function successfully hoists an eliminable addrspacecast or V is - /// already such an addrspacecast, it returns the transformed value (which is - /// guaranteed to be an addrspacecast); otherwise, it returns nullptr. - Value *hoistAddrSpaceCastFrom(Value *V, int Depth = 0); - /// Helper function for GEPs. - Value *hoistAddrSpaceCastFromGEP(GEPOperator *GEP, int Depth); - /// Helper function for bitcasts. - Value *hoistAddrSpaceCastFromBitCast(BitCastOperator *BC, int Depth); -}; -} - -char NVPTXFavorNonGenericAddrSpaces::ID = 0; - -namespace llvm { -void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); -} -INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic", - "Remove unnecessary non-generic-to-generic addrspacecasts", - false, false) - -// Decides whether V is an addrspacecast and shortcutting V in load/store is -// valid and beneficial. -static bool isEliminableAddrSpaceCast(Value *V) { - // Returns false if V is not even an addrspacecast. - Operator *Cast = dyn_cast(V); - if (Cast == nullptr || Cast->getOpcode() != Instruction::AddrSpaceCast) - return false; - - Value *Src = Cast->getOperand(0); - PointerType *SrcTy = cast(Src->getType()); - PointerType *DestTy = cast(Cast->getType()); - // TODO: For now, we only handle the case where the addrspacecast only changes - // the address space but not the type. If the type also changes, we could - // still get rid of the addrspacecast by adding an extra bitcast, but we - // rarely see such scenarios. - if (SrcTy->getElementType() != DestTy->getElementType()) - return false; - - // Checks whether the addrspacecast is from a non-generic address space to the - // generic address space. - return (SrcTy->getAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC && - DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC); -} - -Value *NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP( - GEPOperator *GEP, int Depth) { - Value *NewOperand = - hoistAddrSpaceCastFrom(GEP->getPointerOperand(), Depth + 1); - if (NewOperand == nullptr) - return nullptr; - - // hoistAddrSpaceCastFrom returns an eliminable addrspacecast or nullptr. - assert(isEliminableAddrSpaceCast(NewOperand)); - Operator *Cast = cast(NewOperand); - - SmallVector Indices(GEP->idx_begin(), GEP->idx_end()); - Value *NewASC; - if (Instruction *GEPI = dyn_cast(GEP)) { - // GEP = gep (addrspacecast X), indices - // => - // NewGEP = gep X, indices - // NewASC = addrspacecast NewGEP - GetElementPtrInst *NewGEP = GetElementPtrInst::Create( - GEP->getSourceElementType(), Cast->getOperand(0), Indices, - "", GEPI); - NewGEP->setIsInBounds(GEP->isInBounds()); - NewGEP->takeName(GEP); - NewASC = new AddrSpaceCastInst(NewGEP, GEP->getType(), "", GEPI); - // Without RAUWing GEP, the compiler would visit GEP again and emit - // redundant instructions. This is exercised in test @rauw in - // access-non-generic.ll. - GEP->replaceAllUsesWith(NewASC); - } else { - // GEP is a constant expression. - Constant *NewGEP = ConstantExpr::getGetElementPtr( - GEP->getSourceElementType(), cast(Cast->getOperand(0)), - Indices, GEP->isInBounds()); - NewASC = ConstantExpr::getAddrSpaceCast(NewGEP, GEP->getType()); - } - return NewASC; -} - -Value *NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromBitCast( - BitCastOperator *BC, int Depth) { - Value *NewOperand = hoistAddrSpaceCastFrom(BC->getOperand(0), Depth + 1); - if (NewOperand == nullptr) - return nullptr; - - // hoistAddrSpaceCastFrom returns an eliminable addrspacecast or nullptr. - assert(isEliminableAddrSpaceCast(NewOperand)); - Operator *Cast = cast(NewOperand); - - // Cast = addrspacecast Src - // BC = bitcast Cast - // => - // Cast' = bitcast Src - // BC' = addrspacecast Cast' - Value *Src = Cast->getOperand(0); - Type *TypeOfNewCast = - PointerType::get(BC->getType()->getPointerElementType(), - Src->getType()->getPointerAddressSpace()); - Value *NewBC; - if (BitCastInst *BCI = dyn_cast(BC)) { - Value *NewCast = new BitCastInst(Src, TypeOfNewCast, "", BCI); - NewBC = new AddrSpaceCastInst(NewCast, BC->getType(), "", BCI); - NewBC->takeName(BC); - // Without RAUWing BC, the compiler would visit BC again and emit - // redundant instructions. This is exercised in test @rauw in - // access-non-generic.ll. - BC->replaceAllUsesWith(NewBC); - } else { - // BC is a constant expression. - Constant *NewCast = - ConstantExpr::getBitCast(cast(Src), TypeOfNewCast); - NewBC = ConstantExpr::getAddrSpaceCast(NewCast, BC->getType()); - } - return NewBC; -} - -Value *NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFrom(Value *V, - int Depth) { - // Returns V if V is already an eliminable addrspacecast. - if (isEliminableAddrSpaceCast(V)) - return V; - - // Limit the depth to prevent this recursive function from running too long. - const int MaxDepth = 20; - if (Depth >= MaxDepth) - return nullptr; - - // If V is a GEP or bitcast, hoist the addrspacecast if any from its pointer - // operand. This enables optimizeMemoryInstruction to shortcut addrspacecasts - // that are not directly used by the load/store. - if (GEPOperator *GEP = dyn_cast(V)) - return hoistAddrSpaceCastFromGEP(GEP, Depth); - - if (BitCastOperator *BC = dyn_cast(V)) - return hoistAddrSpaceCastFromBitCast(BC, Depth); - - return nullptr; -} - -bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI, - unsigned Idx) { - Value *NewOperand = hoistAddrSpaceCastFrom(MI->getOperand(Idx)); - if (NewOperand == nullptr) - return false; - - // load/store (addrspacecast X) => load/store X if shortcutting the - // addrspacecast is valid and can improve performance. - // - // e.g., - // %1 = addrspacecast float addrspace(3)* %0 to float* - // %2 = load float* %1 - // -> - // %2 = load float addrspace(3)* %0 - // - // Note: the addrspacecast can also be a constant expression. - assert(isEliminableAddrSpaceCast(NewOperand)); - Operator *ASC = dyn_cast(NewOperand); - MI->setOperand(Idx, ASC->getOperand(0)); - return true; -} - -bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) { - if (DisableFavorNonGeneric || skipFunction(F)) - return false; - - bool Changed = false; - for (BasicBlock &B : F) { - for (Instruction &I : B) { - if (isa(I)) { - // V = load P - Changed |= optimizeMemoryInstruction(&I, 0); - } else if (isa(I)) { - // store V, P - Changed |= optimizeMemoryInstruction(&I, 1); - } - } - } - return Changed; -} - -FunctionPass *llvm::createNVPTXFavorNonGenericAddrSpacesPass() { - return new NVPTXFavorNonGenericAddrSpaces(); -} Index: llvm/trunk/lib/Target/NVPTX/NVPTXLowerAlloca.cpp =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ llvm/trunk/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -20,8 +20,8 @@ // %Generic = addrspacecast i32 addrspace(5)* %A to i32* // store i32 0, i32 addrspace(5)* %Generic ; emits st.local.u32 // -// And we will rely on NVPTXFavorNonGenericAddrSpace to combine the last -// two instructions. +// And we will rely on NVPTXInferAddressSpaces to combine the last two +// instructions. // //===----------------------------------------------------------------------===// @@ -83,7 +83,7 @@ UI != UE; ) { // Check Load, Store, GEP, and BitCast Uses on alloca and make them // use the converted generic address, in order to expose non-generic - // addrspacecast to NVPTXFavorNonGenericAddrSpace. For other types + // addrspacecast to NVPTXInferAddressSpaces. For other types // of instructions this is unnecessary and may introduce redundant // address cast. const auto &AllocaUse = *UI++; Index: llvm/trunk/lib/Target/NVPTX/NVPTXLowerArgs.cpp =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ llvm/trunk/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -47,7 +47,7 @@ // ... // } // -// Later, NVPTXFavorNonGenericAddrSpaces will optimize it to +// Later, NVPTXInferAddressSpaces will optimize it to // // define void @foo(float* %input) { // %input2 = addrspacecast float* %input to float addrspace(1)* @@ -85,8 +85,8 @@ // ; use %b_generic // } // -// TODO: merge this pass with NVPTXFavorNonGenericAddrSpace so that other passes -// don't cancel the addrspacecast pair this pass emits. +// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't +// cancel the addrspacecast pair this pass emits. //===----------------------------------------------------------------------===// #include "NVPTX.h" @@ -116,7 +116,7 @@ void handleByValParam(Argument *Arg); // Knowing Ptr must point to the global address space, this function // addrspacecasts Ptr to global and then back to generic. This allows - // NVPTXFavorNonGenericAddrSpace to fold the global-to-generic cast into + // NVPTXInferAddressSpaces to fold the global-to-generic cast into // loads/stores that appear later. void markPointerAsGlobal(Value *Ptr); Index: llvm/trunk/lib/Target/NVPTX/NVPTXTargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ llvm/trunk/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -49,11 +49,6 @@ using namespace llvm; -static cl::opt UseInferAddressSpaces( - "nvptx-use-infer-addrspace", cl::init(true), cl::Hidden, - cl::desc("Optimize address spaces using NVPTXInferAddressSpaces instead of " - "NVPTXFavorNonGenericAddrSpaces")); - // LSV is still relatively new; this switch lets us turn it off in case we // encounter (or suspect) a bug. static cl::opt @@ -67,7 +62,6 @@ void initializeGenericToNVVMPass(PassRegistry&); void initializeNVPTXAllocaHoistingPass(PassRegistry &); void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); -void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); void initializeNVPTXInferAddressSpacesPass(PassRegistry &); void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerArgsPass(PassRegistry &); @@ -87,7 +81,6 @@ initializeGenericToNVVMPass(PR); initializeNVPTXAllocaHoistingPass(PR); initializeNVPTXAssignValidGlobalNamesPass(PR); - initializeNVPTXFavorNonGenericAddrSpacesPass(PR); initializeNVPTXInferAddressSpacesPass(PR); initializeNVPTXLowerArgsPass(PR); initializeNVPTXLowerAllocaPass(PR); @@ -206,15 +199,7 @@ // be eliminated by SROA. addPass(createSROAPass()); addPass(createNVPTXLowerAllocaPass()); - if (UseInferAddressSpaces) { - addPass(createNVPTXInferAddressSpacesPass()); - } else { - addPass(createNVPTXFavorNonGenericAddrSpacesPass()); - // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave - // them unused. We could remove dead code in an ad-hoc manner, but that - // requires manual work and might be error-prone. - addPass(createDeadCodeEliminationPass()); - } + addPass(createNVPTXInferAddressSpacesPass()); } void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() { Index: llvm/trunk/test/CodeGen/NVPTX/access-non-generic.ll =================================================================== --- llvm/trunk/test/CodeGen/NVPTX/access-non-generic.ll +++ llvm/trunk/test/CodeGen/NVPTX/access-non-generic.ll @@ -1,9 +1,6 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix PTX ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-use-infer-addrspace=true | FileCheck %s --check-prefix PTX -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-use-infer-addrspace=false | FileCheck %s --check-prefix PTX -; RUN: opt < %s -S -nvptx-favor-non-generic -dce | FileCheck %s --check-prefix IR -; RUN: opt < %s -S -nvptx-infer-addrspace | FileCheck %s --check-prefix IR --check-prefix IR-WITH-LOOP +; RUN: opt < %s -S -nvptx-infer-addrspace | FileCheck %s --check-prefix IR @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4 @scalar = internal addrspace(3) global float 0.000000e+00, align 4 @@ -135,7 +132,7 @@ } define void @loop() { -; IR-WITH-LOOP-LABEL: @loop( +; IR-LABEL: @loop( entry: %p = addrspacecast [10 x float] addrspace(3)* @array to float* %end = getelementptr float, float* %p, i64 10 @@ -143,12 +140,12 @@ loop: %i = phi float* [ %p, %entry ], [ %i2, %loop ] -; IR-WITH-LOOP: phi float addrspace(3)* [ %p, %entry ], [ %i2, %loop ] +; IR: phi float addrspace(3)* [ %p, %entry ], [ %i2, %loop ] %v = load float, float* %i -; IR-WITH-LOOP: %v = load float, float addrspace(3)* %i +; IR: %v = load float, float addrspace(3)* %i call void @use(float %v) %i2 = getelementptr float, float* %i, i64 1 -; IR-WITH-LOOP: %i2 = getelementptr float, float addrspace(3)* %i, i64 1 +; IR: %i2 = getelementptr float, float addrspace(3)* %i, i64 1 %exit_cond = icmp eq float* %i2, %end br i1 %exit_cond, label %exit, label %loop @@ -159,7 +156,7 @@ @generic_end = external global float* define void @loop_with_generic_bound() { -; IR-WITH-LOOP-LABEL: @loop_with_generic_bound( +; IR-LABEL: @loop_with_generic_bound( entry: %p = addrspacecast [10 x float] addrspace(3)* @array to float* %end = load float*, float** @generic_end @@ -167,15 +164,15 @@ loop: %i = phi float* [ %p, %entry ], [ %i2, %loop ] -; IR-WITH-LOOP: phi float addrspace(3)* [ %p, %entry ], [ %i2, %loop ] +; IR: phi float addrspace(3)* [ %p, %entry ], [ %i2, %loop ] %v = load float, float* %i -; IR-WITH-LOOP: %v = load float, float addrspace(3)* %i +; IR: %v = load float, float addrspace(3)* %i call void @use(float %v) %i2 = getelementptr float, float* %i, i64 1 -; IR-WITH-LOOP: %i2 = getelementptr float, float addrspace(3)* %i, i64 1 +; IR: %i2 = getelementptr float, float addrspace(3)* %i, i64 1 %exit_cond = icmp eq float* %i2, %end -; IR-WITH-LOOP: addrspacecast float addrspace(3)* %i2 to float* -; IR-WITH-LOOP: icmp eq float* %{{[0-9]+}}, %end +; IR: addrspacecast float addrspace(3)* %i2 to float* +; IR: icmp eq float* %{{[0-9]+}}, %end br i1 %exit_cond, label %exit, label %loop exit: Index: llvm/trunk/test/CodeGen/NVPTX/addrspacecast.ll =================================================================== --- llvm/trunk/test/CodeGen/NVPTX/addrspacecast.ll +++ llvm/trunk/test/CodeGen/NVPTX/addrspacecast.ll @@ -1,8 +1,5 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 -disable-nvptx-favor-non-generic \ -; RUN: -nvptx-use-infer-addrspace=false | FileCheck %s -check-prefix=PTX32 -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -disable-nvptx-favor-non-generic \ -; RUN: -nvptx-use-infer-addrspace=false | FileCheck %s -check-prefix=PTX64 - +; RUN: llc -O0 < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefix=PTX32 +; RUN: llc -O0 < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefix=PTX64 define i32 @conv1(i32 addrspace(1)* %ptr) { ; PTX32: conv1 Index: llvm/trunk/test/CodeGen/NVPTX/lower-alloca.ll =================================================================== --- llvm/trunk/test/CodeGen/NVPTX/lower-alloca.ll +++ llvm/trunk/test/CodeGen/NVPTX/lower-alloca.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -nvptx-lower-alloca -nvptx-favor-non-generic -dce | FileCheck %s +; RUN: opt < %s -S -nvptx-lower-alloca -nvptx-infer-addrspace | FileCheck %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" Index: llvm/trunk/test/CodeGen/NVPTX/shfl.ll =================================================================== --- llvm/trunk/test/CodeGen/NVPTX/shfl.ll +++ llvm/trunk/test/CodeGen/NVPTX/shfl.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -disable-nvptx-favor-non-generic | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s declare i32 @llvm.nvvm.shfl.down.i32(i32, i32, i32) declare float @llvm.nvvm.shfl.down.f32(float, i32, i32)