Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -3176,6 +3177,25 @@ Known.One &= Known2.One; break; } + case ISD::CopyFromReg: { + auto R = cast<RegisterSDNode>(Op.getOperand(1)); + const unsigned Reg = R->getReg(); + + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + if (!TRI->isVirtualRegister(Reg)) + break; + + const MachineRegisterInfo *MRI = &MF->getRegInfo(); + if (!MRI->hasOneDef(Reg)) + break; + + const FunctionLoweringInfo::LiveOutInfo *LOI = FLI->GetLiveOutRegInfo(Reg); + if (!LOI || LOI->Known.getBitWidth() != BitWidth) + break; + + Known = LOI->Known; + break; + } case ISD::FrameIndex: case ISD::TargetFrameIndex: TLI->computeKnownBitsForFrameIndex(Op, Known, DemandedElts, *this, Depth); Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -19548,10 +19548,10 @@ DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); - // If the operand types disagree, extend the shift amount to match. Since - // BT ignores high bits (like shifts) we can use anyextend. + // If the operand types disagree, extend or truncate the shift amount to match. + // Since BT ignores high bits (like shifts) we can use anyextend for the extension. 
if (Src.getValueType() != BitNo.getValueType()) - BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); + BitNo = DAG.getAnyExtOrTrunc(BitNo, dl, Src.getValueType()); X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B, dl, MVT::i8); Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll @@ -108,6 +108,43 @@ ret void } +;CHECK-LABEL: {{^}}s_buffer_load_index_across_bb: +;CHECK-NOT: s_waitcnt; +;CHECK-NOT: v_or_b32 +;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen offset:8 +define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) { +main_body: + %tmp = shl i32 %index, 4 + br label %bb1 + +bb1: ; preds = %main_body + %tmp1 = or i32 %tmp, 8 + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0) + %bitcast = bitcast i32 %load to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}s_buffer_load_index_across_bb_merged: +;CHECK-NOT: s_waitcnt; +;CHECK-NOT: v_or_b32 +;CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen offset:8 +define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %desc, i32 %index) { +main_body: + %tmp = shl i32 %index, 4 + br label %bb1 + +bb1: ; preds = %main_body + %tmp1 = or i32 %tmp, 8 + %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0) + %tmp2 = or i32 %tmp1, 4 + %load2 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp2, i32 0) + %bitcast = bitcast i32 %load to float + %bitcast2 = bitcast i32 %load2 to float + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float %bitcast2, float undef, float undef, i1 true, i1 
true) + ret void +} + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32) declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) Index: test/CodeGen/ARM/atomic-op.ll =================================================================== --- test/CodeGen/ARM/atomic-op.ll +++ test/CodeGen/ARM/atomic-op.ll @@ -183,11 +183,11 @@ ret void } -define void @func2() nounwind { +define void @func2(i16 %int_val) nounwind { entry: %val = alloca i16 %old = alloca i16 - store i16 31, i16* %val + store i16 %int_val, i16* %val ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -197,7 +197,7 @@ ; CHECK-BAREMETAL-NOT: __sync %0 = atomicrmw umin i16* %val, i16 16 monotonic store i16 %0, i16* %old - %uneg = sub i16 0, 1 + %uneg = sub i16 0, 2 ; CHECK: ldrex ; CHECK: cmp ; CHECK: strex @@ -249,7 +249,7 @@ ; CHECK-T1-M0: bl ___sync_fetch_and_umin_1 ; CHECK-BAREMETAL: cmp ; CHECK-BAREMETAL-NOT: __sync - %uneg = sub i8 0, 1 + %uneg = sub i8 0, 2 %1 = atomicrmw umin i8* %val, i8 %uneg monotonic store i8 %1, i8* %old ; CHECK: ldrex Index: test/CodeGen/PowerPC/pr35688.ll =================================================================== --- test/CodeGen/PowerPC/pr35688.ll +++ test/CodeGen/PowerPC/pr35688.ll @@ -6,16 +6,14 @@ ; Function Attrs: nounwind define void @ec_GFp_nistp256_points_mul() { ; CHECK-LABEL: ec_GFp_nistp256_points_mul: -; CHECK: ld 5, 0(3) -; CHECK: li 3, 127 -; CHECK: li 4, 0 -; CHECK: subfic 6, 5, 0 -; CHECK: subfze 6, 4 -; CHECK: sradi 7, 6, 63 -; CHECK: srad 6, 6, 3 -; CHECK: subfc 5, 5, 7 -; CHECK: subfe 5, 4, 6 +; CHECK: ld 4, 0(3) +; CHECK: li 3, 0 +; CHECK: subfic 5, 4, 0 +; CHECK: subfze 5, 3 ; CHECK: sradi 5, 5, 63 +; CHECK: subfc 4, 4, 5 +; CHECK: subfe 4, 3, 5 +; CHECK: sradi 4, 4, 63 ; With MemorySSA, everything is taken out of the loop by licm. ; Loads and stores to undef are treated as non-aliasing. 
Index: test/CodeGen/SystemZ/subregliveness-04.ll =================================================================== --- test/CodeGen/SystemZ/subregliveness-04.ll +++ test/CodeGen/SystemZ/subregliveness-04.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -disable-early-taildup -disable-cgp -systemz-subreg-liveness < %s | FileCheck %s ; Check for successful compilation. -; CHECK: lhi %r0, -5 +; CHECK: lhi {{%r[0-9]+}}, -5 target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" target triple = "s390x-ibm-linux" Index: test/CodeGen/X86/fold-tied-op.ll =================================================================== --- test/CodeGen/X86/fold-tied-op.ll +++ test/CodeGen/X86/fold-tied-op.ll @@ -6,8 +6,8 @@ target triple = "i386--netbsd" ; CHECK-LABEL: fn1 -; CHECK: orl {{.*#+}} 4-byte Folded Reload ; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload ; CHECK: xorl {{.*#+}} 4-byte Folded Reload ; CHECK: xorl {{.*#+}} 4-byte Folded Reload ; CHECK: retl Index: test/CodeGen/X86/pr28444.ll =================================================================== --- test/CodeGen/X86/pr28444.ll +++ test/CodeGen/X86/pr28444.ll @@ -11,9 +11,8 @@ define void @extractelt_mismatch_vector_element_type(i32 %arg, i1 %x) { ; CHECK-LABEL: extractelt_mismatch_vector_element_type: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: movb %al, (%rax) -; CHECK-NEXT: movb %al, (%rax) +; CHECK-NEXT: movb $1, (%rax) +; CHECK-NEXT: movb $1, (%rax) ; CHECK-NEXT: retq bb: %tmp = icmp ult i32 %arg, 0