Index: include/llvm/CodeGen/MachineValueType.h
===================================================================
--- include/llvm/CodeGen/MachineValueType.h
+++ include/llvm/CodeGen/MachineValueType.h
@@ -55,85 +55,86 @@
     FIRST_FP_VALUETYPE = f16,
     LAST_FP_VALUETYPE = ppcf128,

-    v2i1 = 13,    // 2 x i1
-    v4i1 = 14,    // 4 x i1
-    v8i1 = 15,    // 8 x i1
-    v16i1 = 16,   // 16 x i1
-    v32i1 = 17,   // 32 x i1
-    v64i1 = 18,   // 64 x i1
-    v512i1 = 19,  // 512 x i1
-    v1024i1 = 20, // 1024 x i1
-
-    v1i8 = 21,    // 1 x i8
-    v2i8 = 22,    // 2 x i8
-    v4i8 = 23,    // 4 x i8
-    v8i8 = 24,    // 8 x i8
-    v16i8 = 25,   // 16 x i8
-    v32i8 = 26,   // 32 x i8
-    v64i8 = 27,   // 64 x i8
-    v128i8 = 28,  //128 x i8
-    v256i8 = 29,  //256 x i8
-
-    v1i16 = 30,   // 1 x i16
-    v2i16 = 31,   // 2 x i16
-    v4i16 = 32,   // 4 x i16
-    v8i16 = 33,   // 8 x i16
-    v16i16 = 34,  // 16 x i16
-    v32i16 = 35,  // 32 x i16
-    v64i16 = 36,  // 64 x i16
-    v128i16 = 37, //128 x i16
-
-    v1i32 = 38,   // 1 x i32
-    v2i32 = 39,   // 2 x i32
-    v4i32 = 40,   // 4 x i32
-    v8i32 = 41,   // 8 x i32
-    v16i32 = 42,  // 16 x i32
-    v32i32 = 43,  // 32 x i32
-    v64i32 = 44,  // 64 x i32
-
-    v1i64 = 45,   // 1 x i64
-    v2i64 = 46,   // 2 x i64
-    v4i64 = 47,   // 4 x i64
-    v8i64 = 48,   // 8 x i64
-    v16i64 = 49,  // 16 x i64
-    v32i64 = 50,  // 32 x i64
-
-    v1i128 = 51,  // 1 x i128
-
-    FIRST_INTEGER_VECTOR_VALUETYPE = v2i1,
+    v1i1 = 13,    // 1 x i1
+    v2i1 = 14,    // 2 x i1
+    v4i1 = 15,    // 4 x i1
+    v8i1 = 16,    // 8 x i1
+    v16i1 = 17,   // 16 x i1
+    v32i1 = 18,   // 32 x i1
+    v64i1 = 19,   // 64 x i1
+    v512i1 = 20,  // 512 x i1
+    v1024i1 = 21, // 1024 x i1
+
+    v1i8 = 22,    // 1 x i8
+    v2i8 = 23,    // 2 x i8
+    v4i8 = 24,    // 4 x i8
+    v8i8 = 25,    // 8 x i8
+    v16i8 = 26,   // 16 x i8
+    v32i8 = 27,   // 32 x i8
+    v64i8 = 28,   // 64 x i8
+    v128i8 = 29,  //128 x i8
+    v256i8 = 30,  //256 x i8
+
+    v1i16 = 31,   // 1 x i16
+    v2i16 = 32,   // 2 x i16
+    v4i16 = 33,   // 4 x i16
+    v8i16 = 34,   // 8 x i16
+    v16i16 = 35,  // 16 x i16
+    v32i16 = 36,  // 32 x i16
+    v64i16 = 37,  // 64 x i16
+    v128i16 = 38, //128 x i16
+
+    v1i32 = 39,   // 1 x i32
+    v2i32 = 40,   // 2 x i32
+    v4i32 = 41,   // 4 x i32
+    v8i32 = 42,   // 8 x i32
+    v16i32 = 43,  // 16 x i32
+    v32i32 = 44,  // 32 x i32
+    v64i32 = 45,  // 64 x i32
+
+    v1i64 = 46,   // 1 x i64
+    v2i64 = 47,   // 2 x i64
+    v4i64 = 48,   // 4 x i64
+    v8i64 = 49,   // 8 x i64
+    v16i64 = 50,  // 16 x i64
+    v32i64 = 51,  // 32 x i64
+
+    v1i128 = 52,  // 1 x i128
+
+    FIRST_INTEGER_VECTOR_VALUETYPE = v1i1,
     LAST_INTEGER_VECTOR_VALUETYPE = v1i128,

-    v2f16 = 52,   // 2 x f16
-    v4f16 = 53,   // 4 x f16
-    v8f16 = 54,   // 8 x f16
-    v1f32 = 55,   // 1 x f32
-    v2f32 = 56,   // 2 x f32
-    v4f32 = 57,   // 4 x f32
-    v8f32 = 58,   // 8 x f32
-    v16f32 = 59,  // 16 x f32
-    v1f64 = 60,   // 1 x f64
-    v2f64 = 61,   // 2 x f64
-    v4f64 = 62,   // 4 x f64
-    v8f64 = 63,   // 8 x f64
+    v2f16 = 53,   // 2 x f16
+    v4f16 = 54,   // 4 x f16
+    v8f16 = 55,   // 8 x f16
+    v1f32 = 56,   // 1 x f32
+    v2f32 = 57,   // 2 x f32
+    v4f32 = 58,   // 4 x f32
+    v8f32 = 59,   // 8 x f32
+    v16f32 = 60,  // 16 x f32
+    v1f64 = 61,   // 1 x f64
+    v2f64 = 62,   // 2 x f64
+    v4f64 = 63,   // 4 x f64
+    v8f64 = 64,   // 8 x f64

     FIRST_FP_VECTOR_VALUETYPE = v2f16,
     LAST_FP_VECTOR_VALUETYPE = v8f64,

-    FIRST_VECTOR_VALUETYPE = v2i1,
+    FIRST_VECTOR_VALUETYPE = v1i1,
     LAST_VECTOR_VALUETYPE = v8f64,

-    x86mmx = 64,  // This is an X86 MMX value
+    x86mmx = 65,  // This is an X86 MMX value

-    Glue = 65,    // This glues nodes together during pre-RA sched
+    Glue = 66,    // This glues nodes together during pre-RA sched

-    isVoid = 66,  // This has no value
+    isVoid = 67,  // This has no value

-    Untyped = 67, // This value takes a register, but has
+    Untyped = 68, // This value takes a register, but has
                   // unspecified type. The register class
                   // will be determined by the opcode.

     FIRST_VALUETYPE = 0,  // This is always the beginning of the list.
-    LAST_VALUETYPE = 68,  // This always remains at the end of the list.
+    LAST_VALUETYPE = 69,  // This always remains at the end of the list.

     // This is the current maximum for LAST_VALUETYPE.
     // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
@@ -311,6 +312,7 @@
     switch (SimpleTy) {
     default:
       llvm_unreachable("Not a vector MVT!");
+    case v1i1:
     case v2i1:
     case v4i1:
     case v8i1:
@@ -413,6 +415,7 @@
     case v2f16:
     case v2f32:
     case v2f64: return 2;
+    case v1i1:
     case v1i8:
     case v1i16:
     case v1i32:
@@ -442,6 +445,7 @@
                        "in codegen and has no size");
     case Metadata:
       llvm_unreachable("Value type is metadata.");
+    case v1i1:
     case i1 :  return 1;
     case v2i1: return 2;
     case v4i1: return 4;
@@ -587,6 +591,7 @@
     default:
       break;
     case MVT::i1:
+      if (NumElements == 1)  return MVT::v1i1;
      if (NumElements == 2)  return MVT::v2i1;
      if (NumElements == 4)  return MVT::v4i1;
      if (NumElements == 8)  return MVT::v8i1;
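For reference, a minimal sketch (not part of the patch, and assuming this revision's headers) of what the renumbering above buys through the public MVT API — requesting a one-element i1 vector now resolves to a real simple type:

  // mvt_v1i1_check.cpp -- hypothetical standalone check.
  #include "llvm/CodeGen/MachineValueType.h"
  #include <cassert>

  int main() {
    // getVectorVT(i1, 1) now resolves to v1i1; before this patch there was
    // no matching simple type for a 1-element i1 vector.
    llvm::MVT VT = llvm::MVT::getVectorVT(llvm::MVT::i1, 1);
    assert(VT == llvm::MVT::v1i1);
    assert(VT.getVectorNumElements() == 1); // new case in the element-count switch
    assert(VT.getSizeInBits() == 1);        // new case in the size switch
    return 0;
  }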
Index: include/llvm/CodeGen/ValueTypes.td
===================================================================
--- include/llvm/CodeGen/ValueTypes.td
+++ include/llvm/CodeGen/ValueTypes.td
@@ -33,69 +33,70 @@
 def f128   : ValueType<128, 11>;   // 128-bit floating point value
 def ppcf128: ValueType<128, 12>;   // PPC 128-bit floating point value

-def v2i1   : ValueType<2 ,  13>;   // 2 x i1 vector value
-def v4i1   : ValueType<4 ,  14>;   // 4 x i1 vector value
-def v8i1   : ValueType<8 ,  15>;   // 8 x i1 vector value
-def v16i1  : ValueType<16,  16>;   // 16 x i1 vector value
-def v32i1  : ValueType<32 , 17>;   // 32 x i1 vector value
-def v64i1  : ValueType<64 , 18>;   // 64 x i1 vector value
-def v512i1 : ValueType<512, 19>;   // 512 x i1 vector value
-def v1024i1: ValueType<1024,20>;   //1024 x i1 vector value
-
-def v1i8   : ValueType<16,  21>;   // 1 x i8 vector value
-def v2i8   : ValueType<16 , 22>;   // 2 x i8 vector value
-def v4i8   : ValueType<32 , 23>;   // 4 x i8 vector value
-def v8i8   : ValueType<64 , 24>;   // 8 x i8 vector value
-def v16i8  : ValueType<128, 25>;   // 16 x i8 vector value
-def v32i8  : ValueType<256, 26>;   // 32 x i8 vector value
-def v64i8  : ValueType<512, 27>;   // 64 x i8 vector value
-def v128i8 : ValueType<1024,28>;   //128 x i8 vector value
-def v256i8 : ValueType<2048,29>;   //256 x i8 vector value
-
-def v1i16  : ValueType<16 , 30>;   // 1 x i16 vector value
-def v2i16  : ValueType<32 , 31>;   // 2 x i16 vector value
-def v4i16  : ValueType<64 , 32>;   // 4 x i16 vector value
-def v8i16  : ValueType<128, 33>;   // 8 x i16 vector value
-def v16i16 : ValueType<256, 34>;   // 16 x i16 vector value
-def v32i16 : ValueType<512, 35>;   // 32 x i16 vector value
-def v64i16 : ValueType<1024,36>;   // 64 x i16 vector value
-def v128i16: ValueType<2048,37>;   //128 x i16 vector value
-
-def v1i32  : ValueType<32 , 38>;   // 1 x i32 vector value
-def v2i32  : ValueType<64 , 39>;   // 2 x i32 vector value
-def v4i32  : ValueType<128, 40>;   // 4 x i32 vector value
-def v8i32  : ValueType<256, 41>;   // 8 x i32 vector value
-def v16i32 : ValueType<512, 42>;   // 16 x i32 vector value
-def v32i32 : ValueType<1024,43>;   // 32 x i32 vector value
-def v64i32 : ValueType<2048,44>;   // 32 x i32 vector value
-
-def v1i64  : ValueType<64 , 45>;   // 1 x i64 vector value
-def v2i64  : ValueType<128, 46>;   // 2 x i64 vector value
-def v4i64  : ValueType<256, 47>;   // 4 x i64 vector value
-def v8i64  : ValueType<512, 48>;   // 8 x i64 vector value
-def v16i64 : ValueType<1024,49>;   // 16 x i64 vector value
-def v32i64 : ValueType<2048,50>;   // 32 x i64 vector value
-
-def v1i128 : ValueType<128, 51>;   // 1 x i128 vector value
-
-def v2f16  : ValueType<32 , 52>;   // 2 x f16 vector value
-def v4f16  : ValueType<64 , 53>;   // 4 x f16 vector value
-def v8f16  : ValueType<128, 54>;   // 8 x f16 vector value
-def v1f32  : ValueType<32 , 55>;   // 1 x f32 vector value
-def v2f32  : ValueType<64 , 56>;   // 2 x f32 vector value
-def v4f32  : ValueType<128, 57>;   // 4 x f32 vector value
-def v8f32  : ValueType<256, 58>;   // 8 x f32 vector value
-def v16f32 : ValueType<512, 59>;   // 16 x f32 vector value
-def v1f64  : ValueType<64, 60>;    // 1 x f64 vector value
-def v2f64  : ValueType<128, 61>;   // 2 x f64 vector value
-def v4f64  : ValueType<256, 62>;   // 4 x f64 vector value
-def v8f64  : ValueType<512, 63>;   // 8 x f64 vector value
-
-
-def x86mmx : ValueType<64 , 64>;   // X86 MMX value
-def FlagVT : ValueType<0 , 65>;    // Pre-RA sched glue
-def isVoid : ValueType<0 , 66>;    // Produces no value
-def untyped: ValueType<8 , 67>;    // Produces an untyped value
+def v1i1   : ValueType<1 ,  13>;   // 1 x i1 vector value
+def v2i1   : ValueType<2 ,  14>;   // 2 x i1 vector value
+def v4i1   : ValueType<4 ,  15>;   // 4 x i1 vector value
+def v8i1   : ValueType<8 ,  16>;   // 8 x i1 vector value
+def v16i1  : ValueType<16,  17>;   // 16 x i1 vector value
+def v32i1  : ValueType<32 , 18>;   // 32 x i1 vector value
+def v64i1  : ValueType<64 , 19>;   // 64 x i1 vector value
+def v512i1 : ValueType<512, 20>;   // 512 x i1 vector value
+def v1024i1: ValueType<1024,21>;   //1024 x i1 vector value
+
+def v1i8   : ValueType<16, 22>;    // 1 x i8 vector value
+def v2i8   : ValueType<16 , 23>;   // 2 x i8 vector value
+def v4i8   : ValueType<32 , 24>;   // 4 x i8 vector value
+def v8i8   : ValueType<64 , 25>;   // 8 x i8 vector value
+def v16i8  : ValueType<128, 26>;   // 16 x i8 vector value
+def v32i8  : ValueType<256, 27>;   // 32 x i8 vector value
+def v64i8  : ValueType<512, 28>;   // 64 x i8 vector value
+def v128i8 : ValueType<1024,29>;   //128 x i8 vector value
+def v256i8 : ValueType<2048,30>;   //256 x i8 vector value
+
+def v1i16  : ValueType<16 , 31>;   // 1 x i16 vector value
+def v2i16  : ValueType<32 , 32>;   // 2 x i16 vector value
+def v4i16  : ValueType<64 , 33>;   // 4 x i16 vector value
+def v8i16  : ValueType<128, 34>;   // 8 x i16 vector value
+def v16i16 : ValueType<256, 35>;   // 16 x i16 vector value
+def v32i16 : ValueType<512, 36>;   // 32 x i16 vector value
+def v64i16 : ValueType<1024,37>;   // 64 x i16 vector value
+def v128i16: ValueType<2048,38>;   //128 x i16 vector value
+
+def v1i32  : ValueType<32 , 39>;   // 1 x i32 vector value
+def v2i32  : ValueType<64 , 40>;   // 2 x i32 vector value
+def v4i32  : ValueType<128, 41>;   // 4 x i32 vector value
+def v8i32  : ValueType<256, 42>;   // 8 x i32 vector value
+def v16i32 : ValueType<512, 43>;   // 16 x i32 vector value
+def v32i32 : ValueType<1024,44>;   // 32 x i32 vector value
+def v64i32 : ValueType<2048,45>;   // 64 x i32 vector value
+
+def v1i64  : ValueType<64 , 46>;   // 1 x i64 vector value
+def v2i64  : ValueType<128, 47>;   // 2 x i64 vector value
+def v4i64  : ValueType<256, 48>;   // 4 x i64 vector value
+def v8i64  : ValueType<512, 49>;   // 8 x i64 vector value
+def v16i64 : ValueType<1024,50>;   // 16 x i64 vector value
+def v32i64 : ValueType<2048,51>;   // 32 x i64 vector value
+
+def v1i128 : ValueType<128, 52>;   // 1 x i128 vector value
+
+def v2f16  : ValueType<32 , 53>;   // 2 x f16 vector value
+def v4f16  : ValueType<64 , 54>;   // 4 x f16 vector value
+def v8f16  : ValueType<128, 55>;   // 8 x f16 vector value
+def v1f32  : ValueType<32 , 56>;   // 1 x f32 vector value
+def v2f32  : ValueType<64 , 57>;   // 2 x f32 vector value
+def v4f32  : ValueType<128, 58>;   // 4 x f32 vector value
+def v8f32  : ValueType<256, 59>;   // 8 x f32 vector value
+def v16f32 : ValueType<512, 60>;   // 16 x f32 vector value
+def v1f64  : ValueType<64, 61>;    // 1 x f64 vector value
+def v2f64  : ValueType<128, 62>;   // 2 x f64 vector value
+def v4f64  : ValueType<256, 63>;   // 4 x f64 vector value
+def v8f64  : ValueType<512, 64>;   // 8 x f64 vector value
+
+
+def x86mmx : ValueType<64 , 65>;   // X86 MMX value
+def FlagVT : ValueType<0 , 66>;    // Pre-RA sched glue
+def isVoid : ValueType<0 , 67>;    // Produces no value
+def untyped: ValueType<8 , 68>;    // Produces an untyped value

 def token  : ValueType<0 , 120>;   // TokenTy
 def MetadataVT: ValueType<0, 121>; // Metadata
Index: lib/IR/ValueTypes.cpp
===================================================================
--- lib/IR/ValueTypes.cpp
+++ lib/IR/ValueTypes.cpp
@@ -142,6 +142,7 @@
   case MVT::Other:   return "ch";
   case MVT::Glue:    return "glue";
   case MVT::x86mmx:  return "x86mmx";
+  case MVT::v1i1:    return "v1i1";
   case MVT::v2i1:    return "v2i1";
   case MVT::v4i1:    return "v4i1";
   case MVT::v8i1:    return "v8i1";
@@ -220,6 +221,7 @@
   case MVT::f128:    return Type::getFP128Ty(Context);
   case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
   case MVT::x86mmx:  return Type::getX86_MMXTy(Context);
+  case MVT::v1i1:    return VectorType::get(Type::getInt1Ty(Context), 1);
   case MVT::v2i1:    return VectorType::get(Type::getInt1Ty(Context), 2);
   case MVT::v4i1:    return VectorType::get(Type::getInt1Ty(Context), 4);
   case MVT::v8i1:    return VectorType::get(Type::getInt1Ty(Context), 8);
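A quick round-trip illustration of the two additions above (a printable name and an IR type for v1i1) — a hypothetical check against this revision's LLVM, not part of the patch:

  // evt_roundtrip.cpp -- hypothetical standalone check.
  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  #include <cassert>

  int main() {
    llvm::LLVMContext Ctx;
    llvm::EVT VT = llvm::MVT::v1i1;
    assert(VT.getEVTString() == "v1i1");          // new getEVTString() case
    llvm::Type *Ty = VT.getTypeForEVT(Ctx);       // new case: yields <1 x i1>
    assert(llvm::EVT::getEVT(Ty) == llvm::MVT::v1i1); // maps back to v1i1
    return 0;
  }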
Index: lib/Target/X86/X86CallingConv.td
===================================================================
--- lib/Target/X86/X86CallingConv.td
+++ lib/Target/X86/X86CallingConv.td
@@ -74,7 +74,7 @@
   CCIfByVal>,

   // Promote i1/i8/i16 arguments to i32.
-  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[v1i1, i8, i16], CCPromoteToType<i32>>,

   // Promote v8i1/v16i1/v32i1 arguments to i32.
   CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>,
@@ -147,7 +147,7 @@
 def RetCC_#NAME : CallingConv<[
   // Promote i1, v8i1 arguments to i8.
-  CCIfType<[i1, v8i1], CCPromoteToType<i8>>,
+  CCIfType<[v1i1, v8i1], CCPromoteToType<i8>>,

   // Promote v16i1 arguments to i16.
   CCIfType<[v16i1], CCPromoteToType<i16>>,
@@ -207,7 +207,7 @@
   //
   // For code that doesn't care about the ABI, we allow returning more than two
   // integer values in registers.
-  CCIfType<[i1], CCPromoteToType<i8>>,
+  CCIfType<[v1i1], CCPromoteToType<i8>>,
   CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
   CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
   CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
@@ -375,7 +375,7 @@
   CCIfSwiftError>>,

   // For integers, ECX, R8D can be used as extra return registers.
-  CCIfType<[i1], CCPromoteToType<i8>>,
+  CCIfType<[v1i1], CCPromoteToType<i8>>,
   CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>,
   CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>,
   CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX, R8D]>>,
@@ -486,7 +486,7 @@
   CCIfByVal>,

   // Promote i1/i8/i16 arguments to i32.
-  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[v1i1, i8, i16], CCPromoteToType<i32>>,

   // The 'nest' parameter, if any, is passed in R10.
   CCIfNest>>,
@@ -585,7 +585,7 @@
   // FIXME: Handle varargs.

   // Promote i1/i8/i16 arguments to i32.
-  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[v1i1, i8, i16], CCPromoteToType<i32>>,

   // The 'nest' parameter, if any, is passed in R10.
   CCIfNest<CCAssignToReg<[R10]>>,
@@ -797,7 +797,7 @@
 def CC_X86_32_C : CallingConv<[
   // Promote i1/i8/i16 arguments to i32.
-  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[v1i1, i8, i16], CCPromoteToType<i32>>,

   // The 'nest' parameter, if any, is passed in ECX.
   CCIfNest<CCAssignToReg<[ECX]>>,
@@ -817,7 +817,7 @@
   CCIfByVal>,

   // Promote i1/i8/i16 arguments to i32.
-  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[v1i1, i8, i16], CCPromoteToType<i32>>,

   // If the call is not a vararg call, some arguments may be passed
   // in integer registers.
@@ -829,7 +829,7 @@

 def CC_X86_32_FastCall : CallingConv<[
   // Promote i1/i8/i16 arguments to i32.
-  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[v1i1, i8, i16], CCPromoteToType<i32>>,

   // The 'nest' parameter, if any, is passed in EAX.
   CCIfNest<CCAssignToReg<[EAX]>>,
@@ -859,14 +859,14 @@

 def CC_X86_32_ThisCall_Mingw : CallingConv<[
   // Promote i1/i8/i16 arguments to i32.
-  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[v1i1, i8, i16], CCPromoteToType<i32>>,

   CCDelegateTo<CC_X86_32_ThisCall_Common>
 ]>;

 def CC_X86_32_ThisCall_Win : CallingConv<[
   // Promote i1/i8/i16 arguments to i32.
-  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[v1i1, i8, i16], CCPromoteToType<i32>>,

   // Pass sret arguments indirectly through stack.
   CCIfSRet>,
@@ -886,7 +886,7 @@
   CCIfByVal>,

   // Promote i1/i8/i16 arguments to i32.
-  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[v1i1, i8, i16], CCPromoteToType<i32>>,

   // The 'nest' parameter, if any, is passed in EAX.
   CCIfNest>,
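The ABI-visible behavior of these calling-convention rules is unchanged: a 1-bit value still travels widened through an integer register or stack slot, and only its in-register model becomes the v1i1 mask type. A plain-C++ model of that promote/truncate round trip (illustrative only; the function names are made up):

  #include <cstdint>

  // Model of CCPromoteToType<i32> for an i1/v1i1 argument: the 1-bit value
  // is carried in a wider integer, and only bit 0 is meaningful.
  uint32_t promoteI1(bool B) { return static_cast<uint32_t>(B); }

  // Model of the matching narrowing on the receiving side: keep bit 0 only.
  bool truncateToI1(uint32_t V) { return (V & 1) != 0; }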
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1171,7 +1171,7 @@
     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
-    addRegisterClass(MVT::i1,    &X86::VK1RegClass);
+    addRegisterClass(MVT::v1i1,  &X86::VK1RegClass);
     addRegisterClass(MVT::v8i1,  &X86::VK8RegClass);
     addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
@@ -1186,16 +1186,6 @@
       setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
       setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
     }
-    setOperationAction(ISD::BR_CC,     MVT::i1, Expand);
-    setOperationAction(ISD::SETCC,     MVT::i1, Custom);
-    setOperationAction(ISD::SETCCE,    MVT::i1, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
-    setOperationAction(ISD::XOR,       MVT::i1, Legal);
-    setOperationAction(ISD::OR,        MVT::i1, Legal);
-    setOperationAction(ISD::AND,       MVT::i1, Legal);
-    setOperationAction(ISD::SUB,       MVT::i1, Custom);
-    setOperationAction(ISD::ADD,       MVT::i1, Custom);
-    setOperationAction(ISD::MUL,       MVT::i1, Custom);

     for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                    MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
@@ -1264,7 +1254,6 @@
         setOperationAction(ISD::MSTORE, VT, Custom);
       }
     }
-    setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i1, Custom);
@@ -1359,11 +1348,13 @@

     setOperationAction(ISD::MUL,                MVT::v8i64, Custom);

+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
@@ -1772,7 +1763,7 @@
                                            LLVMContext& Context,
                                            EVT VT) const {
   if (!VT.isVector())
-    return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
+    return MVT::i8;

   if (VT.isSimple()) {
     MVT VVT = VT.getSimpleVT();
@@ -2545,6 +2536,9 @@
                             SelectionDAG &DAG) {
   SDValue ValReturned = ValArg;

+  if (ValVT == MVT::v1i1)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
+
   if (ValVT == MVT::v64i1) {
     // In 32 bit machine, this case is handled by getv64i1Argument
     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
@@ -2567,7 +2561,6 @@
     ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
   }
-
   return DAG.getBitcast(ValVT, ValReturned);
 }
@@ -2873,8 +2866,11 @@
   SDValue Val = DAG.getLoad(
       ValVT, dl, Chain, FIN,
       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
-  return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
-                       : Val;
+  return ExtendedInMem
+             ? (VA.getValVT().isVector()
+                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
+                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
+             : Val;
 }

 // FIXME: Get this from tablegen.
@@ -3024,7 +3020,7 @@
         RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
       else if (RegVT == MVT::x86mmx)
         RC = &X86::VR64RegClass;
-      else if (RegVT == MVT::i1)
+      else if (RegVT == MVT::v1i1)
         RC = &X86::VK1RegClass;
       else if (RegVT == MVT::v8i1)
         RC = &X86::VK8RegClass;
@@ -6943,7 +6939,7 @@
   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
     SDValue In = Op.getOperand(idx);
     if (!In.isUndef())
-      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
   }
   SDLoc dl(Op);
   MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
@@ -6986,12 +6982,12 @@
     if (!isa<ConstantSDNode>(In))
       NonConstIdx.push_back(idx);
     else {
-      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
       HasConstElts = true;
     }
     if (SplatIdx < 0)
       SplatIdx = idx;
-    else if (In != Op.getOperand(SplatIdx))
+    else if (IsSplat && In != Op.getOperand(SplatIdx))
       IsSplat = false;
   }
@@ -13993,7 +13989,6 @@
   SDValue Idx = Op.getOperand(1);
   MVT EltVT = Op.getSimpleValueType();

-  assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
          "Unexpected vector type in ExtractBitFromMaskVector");
@@ -14027,8 +14022,8 @@
                       DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                       DAG.getConstant(MaxSift, dl, MVT::i8));
-    return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
-                       DAG.getIntPtrConstant(0, dl));
+    return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
+                       DAG.getIntPtrConstant(0, dl));
 }

 SDValue
@@ -14039,7 +14034,7 @@
   MVT VecVT = Vec.getSimpleValueType();
   SDValue Idx = Op.getOperand(1);

-  if (Op.getSimpleValueType() == MVT::i1)
+  if (VecVT.getVectorElementType() == MVT::i1)
     return ExtractBitFromMaskVector(Op, DAG);

   if (!isa<ConstantSDNode>(Idx)) {
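The KSHIFTL/KSHIFTR pair used above isolates a single mask bit without ever leaving the k-register domain. The same arithmetic on a plain integer, as a sketch (a 16-bit mask width is assumed here):

  #include <cstdint>

  // Shift bit Idx up to the MSB, then logically shift it back down to bit 0;
  // every other bit is discarded along the way. This mirrors
  // KSHIFTL(MaxSift - IdxVal) followed by KSHIFTR(MaxSift) with MaxSift = 15.
  uint16_t extractMaskBit(uint16_t Mask, unsigned Idx) {
    uint16_t Up = static_cast<uint16_t>(Mask << (15 - Idx)); // bit Idx -> bit 15
    return static_cast<uint16_t>(Up >> 15);                  // bit 15 -> bit 0
  }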
@@ -14210,10 +14205,13 @@
     return EltInVec;
   }

-  // Insertion of one bit into first or last position
-  // can be done with two SHIFTs + OR.
+  // Insertion of one bit into first position
   if (IdxVal == 0 ) {
-    // EltInVec already at correct index and other bits are 0.
+    // Clean top bits of vector.
+    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
+                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
+    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
+                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
     // Clean the first bit in source vector.
     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                       DAG.getConstant(1 , dl, MVT::i8));
@@ -14222,6 +14220,7 @@
     return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
   }

+  // Insertion of one bit into last position
   if (IdxVal == NumElems -1) {
     // Move the bit to the last position inside the vector.
     EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
@@ -17372,8 +17371,7 @@

   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

-  assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
-         && "SetCC type must be 8-bit or 1-bit integer");
+  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDLoc dl(Op);
@@ -17507,7 +17505,7 @@

     if (SSECC != 8) {
       if (Subtarget.hasAVX512()) {
-        SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
+        SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
                                   CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
         return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
                            DL, VT, Cmp, Op1, Op2);
@@ -17555,9 +17553,10 @@
   }

   // AVX512 fallback is to lower selects of scalar floats to masked moves.
-  if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
-      Subtarget.hasAVX512())
-    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
+  if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
+    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
+    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
+  }

   if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
     SDValue Op1Scalar;
@@ -19100,8 +19099,8 @@

 /// \brief Creates an SDNode for a predicated scalar operation.
 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
-/// The mask is coming as MVT::i8 and it should be truncated
-/// to MVT::i1 while lowering masking intrinsics.
+/// The mask is coming as MVT::i8 and it should be transformed
+/// to MVT::v1i1 while lowering masking intrinsics.
 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
 /// "X86select" instead of "vselect". We just can't create the "vselect" node
 /// for a scalar instruction.
@@ -19109,24 +19108,30 @@
                                     SDValue PreservedSrc,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
-  if (isAllOnesConstant(Mask))
-    return Op;

   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
-  // The mask should be of type MVT::i1
-  SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+
+  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask)) {
+    if (MaskConst->getZExtValue() & 0x1)
+      return Op;
+    return PreservedSrc.isUndef() ?
getZeroVector(VT, Subtarget, DAG, dl) + : PreservedSrc; + } if (Op.getOpcode() == X86ISD::FSETCCM || - Op.getOpcode() == X86ISD::FSETCCM_RND) + Op.getOpcode() == X86ISD::FSETCCM_RND) { + SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask); return DAG.getNode(ISD::AND, dl, VT, Op, IMask); - if (Op.getOpcode() == X86ISD::VFPCLASS || - Op.getOpcode() == X86ISD::VFPCLASSS) + } + if (Op.getOpcode() == X86ISD::VFPCLASSS) { + SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask); return DAG.getNode(ISD::OR, dl, VT, Op, IMask); + } if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); + return DAG.getNode(X86ISD::SELECTS, dl, VT, Mask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { @@ -19558,10 +19563,11 @@ SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); - SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask, + DAG.getIntPtrConstant(0, dl)); } case CMP_MASK: case CMP_MASK_CC: { @@ -19621,18 +19627,18 @@ if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); } //default rounding mode if(!Cmp.getNode()) - Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask, + DAG.getIntPtrConstant(0, dl)); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; @@ -19680,13 +19686,13 @@ SDValue FCmp; if (isRoundModeCurDirection(Sae)) - FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8)); + FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8)); else - FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8), Sae); - // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg" - return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp); + FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8), Sae); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp, + DAG.getIntPtrConstant(0, dl)); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), @@ -23425,8 +23431,6 @@ assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"); - EVT EltVT = NVT.getVectorElementType(); - SDLoc dl(InOp); if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) { @@ -23444,6 +23448,8 @@ for (unsigned i = 0; i < InNumElts; ++i) Ops.push_back(InOp.getOperand(i)); + EVT EltVT = InOp.getOperand(0).getValueType(); + SDValue FillVal = FillWithZeroes ? 
DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT); for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) @@ -29607,8 +29613,9 @@ if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) - SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getAllOnesConstant(DL, CondVT)); + SDValue CondNew = + DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(APInt::getAllOnesValue(8), DL, CondVT)); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); } @@ -31356,13 +31363,11 @@ // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; if (Subtarget.hasAVX512()) { - SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00, - CMP01, - DAG.getConstant(x86cc, DL, MVT::i8)); - if (N->getValueType(0) != MVT::i1) - return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), - FSetCC); - return FSetCC; + SDValue FSetCC = + DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, + DAG.getConstant(x86cc, DL, MVT::i8)); + return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0), + FSetCC, DAG.getIntPtrConstant(0, DL)); } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -31,8 +31,7 @@ RegisterClass KRCWM = !cast("VK" # NumElts # "WM"); // The mask VT. - ValueType KVT = !cast(!if (!eq (NumElts, 1), "i1", - "v" # NumElts # "i1")); + ValueType KVT = !cast("v" # NumElts # "i1"); // Suffix used in the instruction mnemonic. 
string Suffix = suffix; @@ -2263,7 +2262,7 @@ let Predicates = [HasAVX512] in { def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), (KMOVWmk addr:$dst, VK16:$src)>; - def : Pat<(i1 (load addr:$src)), + def : Pat<(v1i1 (load addr:$src)), (COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>; def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))), (KMOVWkm addr:$src)>; @@ -2280,77 +2279,55 @@ } let Predicates = [HasAVX512] in { - def : Pat<(i1 (trunc (i64 GR64:$src))), - (COPY_TO_REGCLASS (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit), - (i32 1)), VK1)>; - - def : Pat<(i1 (trunc (i32 GR32:$src))), - (COPY_TO_REGCLASS (AND32ri8 $src, (i32 1)), VK1)>; - - def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))), - (COPY_TO_REGCLASS GR32:$src, VK1)>; - - def : Pat<(i1 (trunc (i8 GR8:$src))), - (COPY_TO_REGCLASS - (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR8:$src, sub_8bit), (i32 1)), VK1)>; - - def : Pat<(i1 (trunc (i16 GR16:$src))), - (COPY_TO_REGCLASS - (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR16:$src, sub_16bit), (i32 1)), VK1)>; - - def : Pat<(i32 (zext VK1:$src)), - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1))>; - - def : Pat<(i32 (anyext VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, GR32)>; - - def : Pat<(i8 (zext VK1:$src)), - (EXTRACT_SUBREG - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_8bit)>; - - def : Pat<(i8 (anyext VK1:$src)), - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>; - - def : Pat<(i64 (zext VK1:$src)), - (SUBREG_TO_REG (i64 0), - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_32bit)>; - - def : Pat<(i64 (anyext VK1:$src)), - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>; + multiclass operation_gpr_mask_copy_lowering { + def : Pat<(maskVT (scalar_to_vector gprRC:$src)), + (COPY_TO_REGCLASS gprRC:$src, maskRC)>; + + def : Pat<(gprVT (X86Vextract maskRC:$src, (iPTR 0))), + (COPY_TO_REGCLASS maskRC:$src, gprRC)>; + + } + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + defm : operation_gpr_mask_copy_lowering; + + def : Pat<(i32 (anyext (i8 (X86Vextract VK64:$src, (iPTR 0))))), (COPY_TO_REGCLASS VK64:$src, GR32)>; + def : Pat<(i32 (anyext (i8 (X86Vextract VK32:$src, (iPTR 0))))), (COPY_TO_REGCLASS VK32:$src, GR32)>; + def : Pat<(i32 (anyext (i8 (X86Vextract VK16:$src, (iPTR 0))))), (COPY_TO_REGCLASS VK16:$src, GR32)>; + def : Pat<(i32 (anyext (i8 (X86Vextract VK8:$src, (iPTR 0))))), (COPY_TO_REGCLASS VK8:$src, GR32)>; + def : Pat<(i32 (anyext (i8 (X86Vextract VK4:$src, (iPTR 0))))), (COPY_TO_REGCLASS VK4:$src, GR32)>; + def : Pat<(i32 (anyext (i8 (X86Vextract VK2:$src, (iPTR 0))))), (COPY_TO_REGCLASS VK2:$src, GR32)>; + + def : Pat<(X86kshiftr (X86kshiftl (v1i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) , + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1))), + VK1)>; + def : Pat<(X86kshiftr (X86kshiftl (v16i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) , + 
(COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1))), + VK16)>; + def : Pat<(X86kshiftr (X86kshiftl (v8i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)) , + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1))), + VK8)>; - def : Pat<(i16 (zext VK1:$src)), - (EXTRACT_SUBREG - (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_16bit)>; - - def : Pat<(i16 (anyext VK1:$src)), - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>; -} -def : Pat<(v16i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK16)>; -def : Pat<(v8i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK8)>; -def : Pat<(v4i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK4)>; -def : Pat<(v2i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK2)>; -def : Pat<(v32i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK32)>; -def : Pat<(v64i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK64)>; - -def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; -def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; -def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; - -def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), (COPY_TO_REGCLASS VK4:$src, VK1)>; -def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), (COPY_TO_REGCLASS VK2:$src, VK1)>; +} // Mask unary operation // - KNOT @@ -2551,14 +2528,11 @@ def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>; def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>; + def : Pat<(v1i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK1)>; def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>; def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>; - let AddedComplexity = 10 in { // To optimize isel table. 
- def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>; - def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; - def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; - } + def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } // Patterns for kmask insert_subvector/extract_subvector to/from index=0 @@ -2570,6 +2544,12 @@ def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))), (VT (COPY_TO_REGCLASS subRC:$src, RC))>; } +defm : operation_subvector_mask_lowering; +defm : operation_subvector_mask_lowering; +defm : operation_subvector_mask_lowering; +defm : operation_subvector_mask_lowering; +defm : operation_subvector_mask_lowering; +defm : operation_subvector_mask_lowering; defm : operation_subvector_mask_lowering; defm : operation_subvector_mask_lowering; @@ -3249,7 +3229,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector - (_.EltVT (X86selects (i1 (trunc GR32:$mask)), + (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), (_.EltVT _.FRC:$src1), (_.EltVT _.FRC:$src2))))))), (COPY_TO_REGCLASS (!cast(InstrStr#rrk) @@ -3260,7 +3240,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector - (_.EltVT (X86selects (i1 (trunc GR32:$mask)), + (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), (_.EltVT _.FRC:$src1), (_.EltVT ZeroFP))))))), (COPY_TO_REGCLASS (!cast(InstrStr#rrkz) @@ -3279,7 +3259,7 @@ (iPTR 0))), (iPTR 0)))), (!cast(InstrStr#mrk) addr:$dst, - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; } @@ -3296,7 +3276,7 @@ (iPTR 0))), (iPTR 0)))), (!cast(InstrStr#mrk) addr:$dst, - (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; } @@ -3310,7 +3290,7 @@ (v16i32 immAllZerosV))))), (iPTR 0))), (!cast(InstrStr#rmkz) - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), addr:$srcAddr)>; def : Pat<(_.info128.VT (extract_subvector @@ -3322,7 +3302,7 @@ (iPTR 0))))), (iPTR 0))), (!cast(InstrStr#rmk) _.info128.RC:$src, - (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), addr:$srcAddr)>; } @@ -3338,7 +3318,7 @@ (v16i32 immAllZerosV))))), (iPTR 0))), (!cast(InstrStr#rmkz) - (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), addr:$srcAddr)>; def : Pat<(_.info128.VT (extract_subvector @@ -3350,7 +3330,7 @@ (iPTR 0))))), (iPTR 0))), (!cast(InstrStr#rmk) _.info128.RC:$src, - (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), addr:$srcAddr)>; } @@ -3381,7 +3361,7 @@ VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), - (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM)), + (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; let hasSideEffects = 0 in Index: lib/Target/X86/X86InstrFragmentsSIMD.td 
=================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -274,7 +274,7 @@ SDTCisSameNumEltsAs<0, 1>]>>; def X86selects : SDNode<"X86ISD::SELECTS", - SDTypeProfile<1, 3, [SDTCisVT<1, i1>, + SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>>; @@ -441,7 +441,7 @@ SDTCisSameNumEltsAs<0,1>, SDTCisVT<2, i32>]>, []>; def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", - SDTypeProfile<1, 2, [SDTCisVT<0, i1>, + SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>, SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", @@ -451,7 +451,7 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, - [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, + [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -6304,6 +6304,8 @@ // SrcReg(MaskReg) -> DestReg(GR64) // SrcReg(MaskReg) -> DestReg(GR32) + // SrcReg(MaskReg) -> DestReg(GR16) + // SrcReg(MaskReg) -> DestReg(GR8) // All KMASK RegClasses hold the same k registers, can be tested against anyone. if (X86::VK16RegClass.contains(SrcReg)) { @@ -6313,10 +6315,21 @@ } if (X86::GR32RegClass.contains(DestReg)) return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk; + if (X86::GR16RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVWrk; + } + if (X86::GR8RegClass.contains(DestReg)) { + assert(!isHReg(DestReg) && "Cannot move between mask and h-reg"); + DestReg = getX86SubSuperRegister(DestReg, 32); + return Subtarget.hasDQI() ? X86::KMOVBrk : X86::KMOVWrk; + } } // SrcReg(GR64) -> DestReg(MaskReg) // SrcReg(GR32) -> DestReg(MaskReg) + // SrcReg(GR16) -> DestReg(MaskReg) + // SrcReg(GR8) -> DestReg(MaskReg) // All KMASK RegClasses hold the same k registers, can be tested against anyone. if (X86::VK16RegClass.contains(DestReg)) { @@ -6326,9 +6339,17 @@ } if (X86::GR32RegClass.contains(SrcReg)) return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr; + if (X86::GR16RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVWkr; + } + if (X86::GR8RegClass.contains(SrcReg)) { + assert(!isHReg(SrcReg) && "Cannot move between mask and h-reg"); + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return Subtarget.hasDQI() ? 
X86::KMOVBkr : X86::KMOVWkr; + } } - // SrcReg(VR128) -> DestReg(GR64) // SrcReg(VR64) -> DestReg(GR64) // SrcReg(GR64) -> DestReg(VR128) Index: lib/Target/X86/X86RegisterInfo.td =================================================================== --- lib/Target/X86/X86RegisterInfo.td +++ lib/Target/X86/X86RegisterInfo.td @@ -511,7 +511,7 @@ 256, (sequence "YMM%u", 0, 31)>; // Mask registers -def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} +def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;} def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;} @@ -519,7 +519,7 @@ def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} -def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;} +def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;} def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;} Index: test/CodeGen/X86/avx512-cmp.ll =================================================================== --- test/CodeGen/X86/avx512-cmp.ll +++ test/CodeGen/X86/avx512-cmp.ll @@ -47,16 +47,20 @@ ret float %c1 } -; FIXME: Can use vcmpeqss and extract from the mask here in AVX512. define i32 @test3(float %a, float %b) { -; ALL-LABEL: test3: -; ALL: ## BB#0: -; ALL-NEXT: vucomiss %xmm1, %xmm0 -; ALL-NEXT: setnp %al -; ALL-NEXT: sete %cl -; ALL-NEXT: andb %al, %cl -; ALL-NEXT: movzbl %cl, %eax -; ALL-NEXT: retq +; KNL-LABEL: test3: +; KNL: ## BB#0: +; KNL-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: retq +; +; SKX-LABEL: test3: +; SKX: ## BB#0: +; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: retq %cmp10.i = fcmp oeq float %a, %b %conv11.i = zext i1 %cmp10.i to i32 @@ -69,7 +73,7 @@ ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; ALL-NEXT: vucomiss %xmm1, %xmm0 ; ALL-NEXT: jne LBB3_1 -; ALL-NEXT: jp LBB3_1 +; ALL-NEXT: jp LBB3_1 ; ALL-NEXT: ## BB#2: ## %return ; ALL-NEXT: retq ; ALL-NEXT: LBB3_1: ## %if.end @@ -158,47 +162,22 @@ } define i32 @test10(i64 %b, i64 %c, i1 %d) { -; KNL-LABEL: test10: -; KNL: ## BB#0: -; KNL-NEXT: andl $1, %edx -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: cmpq %rsi, %rdi -; KNL-NEXT: sete %al -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: korw %k1, %k0, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB8_1 -; KNL-NEXT: ## BB#2: ## %if.end.i -; KNL-NEXT: movl $6, %eax -; KNL-NEXT: retq -; KNL-NEXT: LBB8_1: ## %if.then.i -; KNL-NEXT: movl $5, %eax -; KNL-NEXT: retq -; -; SKX-LABEL: test10: -; SKX: ## BB#0: -; SKX-NEXT: andl $1, %edx -; SKX-NEXT: kmovd %edx, %k0 -; SKX-NEXT: cmpq %rsi, %rdi -; SKX-NEXT: sete %al -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: korw %k1, %k0, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: testb %al, %al -; SKX-NEXT: je LBB8_1 -; SKX-NEXT: ## BB#2: ## %if.end.i -; SKX-NEXT: movl $6, %eax -; SKX-NEXT: retq -; SKX-NEXT: LBB8_1: ## 
%if.then.i -; SKX-NEXT: movl $5, %eax -; SKX-NEXT: retq +; ALL-LABEL: test10: +; ALL: ## BB#0: +; ALL-NEXT: movl %edx, %eax +; ALL-NEXT: andb $1, %al +; ALL-NEXT: cmpq %rsi, %rdi +; ALL-NEXT: sete %cl +; ALL-NEXT: orb %dl, %cl +; ALL-NEXT: andb $1, %cl +; ALL-NEXT: cmpb %cl, %al +; ALL-NEXT: je LBB8_1 +; ALL-NEXT: ## BB#2: ## %if.end.i +; ALL-NEXT: movl $6, %eax +; ALL-NEXT: retq +; ALL-NEXT: LBB8_1: ## %if.then.i +; ALL-NEXT: movl $5, %eax +; ALL-NEXT: retq %cmp8.i = icmp eq i64 %b, %c %or1 = or i1 %d, %cmp8.i Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -1552,10 +1552,10 @@ ; NOVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; NOVL-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpextrq $1, %xmm0, %rax +; NOVL-NEXT: vpextrb $8, %xmm0, %eax ; NOVL-NEXT: andl $1, %eax ; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 -; NOVL-NEXT: vmovq %xmm0, %rax +; NOVL-NEXT: vpextrb $0, %xmm0, %eax ; NOVL-NEXT: andl $1, %eax ; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 ; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] Index: test/CodeGen/X86/avx512-ext.ll =================================================================== --- test/CodeGen/X86/avx512-ext.ll +++ test/CodeGen/X86/avx512-ext.ll @@ -1434,26 +1434,26 @@ define i16 @trunc_i32_to_i1(i32 %a) { ; KNL-LABEL: trunc_i32_to_i1: ; KNL: ## BB#0: -; KNL-NEXT: andl $1, %edi -; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: movw $-4, %ax -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: andl $1, %edi +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX ; KNL-NEXT: retq ; ; SKX-LABEL: trunc_i32_to_i1: ; SKX: ## BB#0: -; SKX-NEXT: andl $1, %edi -; SKX-NEXT: kmovd %edi, %k0 ; SKX-NEXT: movw $-4, %ax -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kshiftrw $1, %k1, %k1 -; SKX-NEXT: kshiftlw $1, %k1, %k1 -; SKX-NEXT: korw %k0, %k1, %k0 +; SKX-NEXT: kmovd %eax, %k0 +; SKX-NEXT: kshiftrw $1, %k0, %k0 +; SKX-NEXT: kshiftlw $1, %k0, %k0 +; SKX-NEXT: andl $1, %edi +; SKX-NEXT: kmovw %edi, %k1 +; SKX-NEXT: korw %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AX %AX %EAX ; SKX-NEXT: retq Index: test/CodeGen/X86/avx512-fsel.ll =================================================================== --- test/CodeGen/X86/avx512-fsel.ll +++ test/CodeGen/X86/avx512-fsel.ll @@ -10,25 +10,10 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: Lcfi0: ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vucomiss %xmm1, %xmm0 -; CHECK-NEXT: setp %al -; CHECK-NEXT: setne %cl -; CHECK-NEXT: setnp %dl -; CHECK-NEXT: sete %sil -; CHECK-NEXT: andb %dl, %sil -; CHECK-NEXT: ## implicit-def: %EDI -; CHECK-NEXT: movb %sil, %dil -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: orb %al, %cl -; CHECK-NEXT: ## implicit-def: %EDI -; CHECK-NEXT: movb %cl, %dil -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: kmovw %k1, %edi -; CHECK-NEXT: movb %dil, %al +; CHECK-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: xorb $-1, %al ; CHECK-NEXT: testb $1, %al -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: jmp LBB0_2 ; 
CHECK-NEXT: LBB0_1: ## %L_0 Index: test/CodeGen/X86/avx512-i1test.ll =================================================================== --- test/CodeGen/X86/avx512-i1test.ll +++ test/CodeGen/X86/avx512-i1test.ll @@ -66,14 +66,13 @@ define i64 @func2(i1 zeroext %i, i32 %j) { ; CHECK-LABEL: func2: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: # kill: %EDI %EDI %RDI ; CHECK-NEXT: testl %esi, %esi ; CHECK-NEXT: je .LBB1_1 ; CHECK-NEXT: # BB#2: # %if.then ; CHECK-NEXT: jmp bar # TAILCALL ; CHECK-NEXT: .LBB1_1: # %return -; CHECK-NEXT: orq $-2, %rdi -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: orq $-2, %rax ; CHECK-NEXT: retq entry: %tobool = icmp eq i32 %j, 0 Index: test/CodeGen/X86/avx512-insert-extract.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract.ll +++ test/CodeGen/X86/avx512-insert-extract.ll @@ -260,8 +260,7 @@ ; KNL-NEXT: kshiftlw $11, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: testb %al, %al +; KNL-NEXT: testb $1, %al ; KNL-NEXT: je LBB10_2 ; KNL-NEXT: ## BB#1: ## %A ; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -275,9 +274,8 @@ ; SKX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ; SKX-NEXT: kshiftlw $11, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: testb %al, %al +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: testb $1, %al ; SKX-NEXT: je LBB10_2 ; SKX-NEXT: ## BB#1: ## %A ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -299,13 +297,10 @@ ; KNL-LABEL: test12: ; KNL: ## BB#0: ; KNL-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 -; KNL-NEXT: vpcmpgtq %zmm1, %zmm3, %k1 -; KNL-NEXT: kunpckbw %k0, %k1, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: testb %al, %al +; KNL-NEXT: testb $1, %al ; KNL-NEXT: cmoveq %rsi, %rdi ; KNL-NEXT: movq %rdi, %rax ; KNL-NEXT: retq @@ -313,13 +308,10 @@ ; SKX-LABEL: test12: ; SKX: ## BB#0: ; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 -; SKX-NEXT: vpcmpgtq %zmm1, %zmm3, %k1 -; SKX-NEXT: kunpckbw %k0, %k1, %k0 -; SKX-NEXT: kshiftlw $15, %k0, %k0 -; SKX-NEXT: kshiftrw $15, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: testb %al, %al +; SKX-NEXT: kshiftlb $7, %k0, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k0 +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: testb $1, %al ; SKX-NEXT: cmoveq %rsi, %rdi ; SKX-NEXT: movq %rdi, %rax ; SKX-NEXT: vzeroupper @@ -335,13 +327,13 @@ ; KNL: ## BB#0: ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al +; KNL-NEXT: movw $-4, %cx +; KNL-NEXT: kmovw %ecx, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k0 -; KNL-NEXT: movw $-4, %ax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: %AX %AX %EAX ; KNL-NEXT: retq @@ -350,13 +342,13 @@ ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al +; SKX-NEXT: movw $-4, %cx +; SKX-NEXT: kmovd %ecx, %k0 +; SKX-NEXT: kshiftrw $1, %k0, %k0 +; SKX-NEXT: kshiftlw $1, %k0, %k0 ; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: movw $-4, %ax -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kshiftrw $1, %k1, %k1 -; SKX-NEXT: kshiftlw $1, %k1, %k1 -; SKX-NEXT: korw %k0, %k1, %k0 +; SKX-NEXT: kmovw %eax, %k1 +; SKX-NEXT: korw %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, 
%eax ; SKX-NEXT: ## kill: %AX %AX %EAX ; SKX-NEXT: retq @@ -373,8 +365,7 @@ ; KNL-NEXT: kshiftlw $11, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: testb %al, %al +; KNL-NEXT: testb $1, %al ; KNL-NEXT: cmoveq %rsi, %rdi ; KNL-NEXT: movq %rdi, %rax ; KNL-NEXT: retq @@ -384,9 +375,8 @@ ; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 ; SKX-NEXT: kshiftlb $3, %k0, %k0 ; SKX-NEXT: kshiftrb $7, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: testb %al, %al +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: testb $1, %al ; SKX-NEXT: cmoveq %rsi, %rdi ; SKX-NEXT: movq %rdi, %rax ; SKX-NEXT: vzeroupper @@ -424,11 +414,10 @@ define i16 @test16(i1 *%addr, i16 %a) { ; KNL-LABEL: test16: ; KNL: ## BB#0: -; KNL-NEXT: movzbl (%rdi), %eax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: movb (%rdi), %al +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] ; KNL-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 @@ -440,11 +429,10 @@ ; ; SKX-LABEL: test16: ; SKX: ## BB#0: -; SKX-NEXT: movzbl (%rdi), %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: vpmovm2d %k1, %zmm0 +; SKX-NEXT: movb (%rdi), %al +; SKX-NEXT: kmovd %esi, %k0 +; SKX-NEXT: vpmovm2d %k0, %zmm0 +; SKX-NEXT: kmovb %eax, %k0 ; SKX-NEXT: vpmovm2d %k0, %zmm1 ; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] ; SKX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 @@ -463,11 +451,10 @@ define i8 @test17(i1 *%addr, i8 %a) { ; KNL-LABEL: test17: ; KNL: ## BB#0: -; KNL-NEXT: movzbl (%rdi), %eax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: movb (%rdi), %al +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] ; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -479,11 +466,10 @@ ; ; SKX-LABEL: test17: ; SKX: ## BB#0: -; SKX-NEXT: movzbl (%rdi), %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: vpmovm2q %k1, %zmm0 +; SKX-NEXT: movb (%rdi), %al +; SKX-NEXT: kmovd %esi, %k0 +; SKX-NEXT: vpmovm2q %k0, %zmm0 +; SKX-NEXT: kmovb %eax, %k0 ; SKX-NEXT: vpmovm2q %k0, %zmm1 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] ; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -1283,12 +1269,11 @@ ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k1 -; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k2 -; SKX-NEXT: kunpckwd %k1, %k2, %k1 -; SKX-NEXT: vpmovm2w %k1, %zmm0 +; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0 +; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1 +; SKX-NEXT: kunpckwd %k0, %k1, %k0 +; SKX-NEXT: vpmovm2w %k0, %zmm0 +; SKX-NEXT: kmovb %eax, %k0 ; SKX-NEXT: vpmovm2w %k0, %zmm1 ; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 @@ -1308,33 +1293,29 @@ ; KNL: ## BB#0: ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: 
kmovw %eax, %k1 ; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpextrd $1, %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; KNL-NEXT: vmovd %xmm0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vpextrb $4, %xmm0, %ecx +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpextrb $0, %xmm0, %ecx +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] ; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 ; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 -; KNL-NEXT: vptestmq %zmm1, %zmm1, %k2 -; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] ; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; KNL-NEXT: vpextrd $3, %xmm0, %eax -; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vpextrb $12, %xmm0, %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] @@ -1349,10 +1330,9 @@ ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k1 -; SKX-NEXT: vpmovm2d %k1, %xmm0 +; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: kmovb %eax, %k0 ; SKX-NEXT: vpmovm2d %k0, %xmm1 ; SKX-NEXT: vpbroadcastq %xmm1, %xmm1 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] @@ -1373,16 +1353,14 @@ ; KNL: ## BB#0: ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpextrb $0, %xmm0, %ecx +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] ; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -1396,13 +1374,12 @@ ; SKX: ## BB#0: ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; SKX-NEXT: kmovb %eax, %k1 ; SKX-NEXT: kshiftlw $1, %k1, %k1 -; SKX-NEXT: kshiftrw $1, %k1, %k1 ; SKX-NEXT: kshiftlw $1, %k0, %k0 -; SKX-NEXT: korw %k0, %k1, %k0 +; SKX-NEXT: kshiftrw $1, %k0, %k0 +; SKX-NEXT: korw %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: %AL %AL %EAX ; SKX-NEXT: retq @@ -1422,8 +1399,10 @@ ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpextrb $0, %xmm0, 
%eax -; KNL-NEXT: addb $4, %al -; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_v2i1: @@ -1431,12 +1410,11 @@ ; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0 ; SKX-NEXT: kshiftlw $15, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: cmpb $1, %al -; SKX-NEXT: movb $3, %al -; SKX-NEXT: adcb $0, %al -; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: retq %t1 = icmp ugt <2 x i64> %a, %b %t2 = extractelement <2 x i1> %t1, i32 0 @@ -1452,8 +1430,10 @@ ; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpextrb $0, %xmm0, %eax -; KNL-NEXT: addb $4, %al -; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: extractelement_v2i1_alt: @@ -1461,12 +1441,11 @@ ; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0 ; SKX-NEXT: kshiftlw $15, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: cmpb $1, %al -; SKX-NEXT: movb $3, %al -; SKX-NEXT: adcb $0, %al -; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: retq %t1 = icmp ugt <2 x i64> %a, %b %t2 = extractelement <2 x i1> %t1, i32 0 @@ -1535,20 +1514,21 @@ ; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpextrb $15, %xmm0, %eax -; KNL-NEXT: addb $4, %al -; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_v64i1: ; SKX: ## BB#0: ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 ; SKX-NEXT: kshiftrq $63, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: cmpb $1, %al -; SKX-NEXT: movb $3, %al -; SKX-NEXT: adcb $0, %al -; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <64 x i8> %a, %b @@ -1566,20 +1546,21 @@ ; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpextrb $15, %xmm0, %eax -; KNL-NEXT: addb $4, %al -; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: retq ; ; SKX-LABEL: extractelement_v64i1_alt: ; SKX: ## BB#0: ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 ; SKX-NEXT: kshiftrq $63, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: cmpb $1, %al -; SKX-NEXT: movb $3, %al -; SKX-NEXT: adcb $0, %al -; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <64 x i8> %a, %b @@ -2332,7 +2313,7 @@ ; SKX-NEXT: vpmovm2q %k0, %xmm0 ; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: andl $1, %edi -; SKX-NEXT: movl -24(%rsp,%rdi,8), %eax +; SKX-NEXT: movzbl -24(%rsp,%rdi,8), %eax ; SKX-NEXT: andl $1, %eax ; 
SKX-NEXT: retq %t1 = icmp ugt <2 x i64> %a, %b @@ -2362,7 +2343,7 @@ ; SKX-NEXT: vpmovm2d %k0, %xmm0 ; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: andl $3, %edi -; SKX-NEXT: movl -24(%rsp,%rdi,4), %eax +; SKX-NEXT: movzbl -24(%rsp,%rdi,4), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: retq %t1 = icmp ugt <4 x i32> %a, %b @@ -2391,7 +2372,7 @@ ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa64 %zmm0, (%rsp) ; KNL-NEXT: andl $7, %edi -; KNL-NEXT: movl (%rsp,%rdi,8), %eax +; KNL-NEXT: movzbl (%rsp,%rdi,8), %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp @@ -2414,7 +2395,7 @@ ; SKX-NEXT: vpmovm2q %k0, %zmm0 ; SKX-NEXT: vmovdqa64 %zmm0, (%rsp) ; SKX-NEXT: andl $7, %edi -; SKX-NEXT: movl (%rsp,%rdi,8), %eax +; SKX-NEXT: movzbl (%rsp,%rdi,8), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp @@ -2444,7 +2425,7 @@ ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vmovdqa32 %zmm0, (%rsp) ; KNL-NEXT: andl $15, %edi -; KNL-NEXT: movl (%rsp,%rdi,4), %eax +; KNL-NEXT: movzbl (%rsp,%rdi,4), %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp @@ -2467,7 +2448,7 @@ ; SKX-NEXT: vpmovm2d %k0, %zmm0 ; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) ; SKX-NEXT: andl $15, %edi -; SKX-NEXT: movl (%rsp,%rdi,4), %eax +; SKX-NEXT: movzbl (%rsp,%rdi,4), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp @@ -2500,9 +2481,8 @@ ; KNL-NEXT: vmovdqa %ymm0, (%rsp) ; KNL-NEXT: andl $31, %edi ; KNL-NEXT: movq %rsp, %rax -; KNL-NEXT: movb (%rdi,%rax), %al -; KNL-NEXT: andb $1, %al -; KNL-NEXT: movzbl %al, %eax +; KNL-NEXT: movzbl (%rdi,%rax), %eax +; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: retq @@ -2524,7 +2504,7 @@ ; SKX-NEXT: vpmovm2w %k0, %zmm0 ; SKX-NEXT: vmovdqu16 %zmm0, (%rsp) ; SKX-NEXT: andl $31, %edi -; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax +; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp Index: test/CodeGen/X86/avx512-insert-extract_i1.ll =================================================================== --- test/CodeGen/X86/avx512-insert-extract_i1.ll +++ test/CodeGen/X86/avx512-insert-extract_i1.ll @@ -22,9 +22,8 @@ ; SKX-NEXT: vmovdqu8 %zmm0, (%rsp) ; SKX-NEXT: andl $63, %edi ; SKX-NEXT: movq %rsp, %rax -; SKX-NEXT: movb (%rdi,%rax), %al -; SKX-NEXT: andb $1, %al -; SKX-NEXT: movzbl %al, %eax +; SKX-NEXT: movzbl (%rdi,%rax), %eax +; SKX-NEXT: andl $1, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp ; SKX-NEXT: vzeroupper Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2881,23 +2881,23 @@ ; CHECK-LABEL: test_mask_vextractf32x4: ; CHECK: ## BB#0: ; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: kshiftlw $12, %k1, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kshiftlw $12, %k0, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kshiftlw $13, %k0, %k2 ; CHECK-NEXT: kshiftrw $15, %k2, %k2 -; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftlw $15, %k0, %k3 ; CHECK-NEXT: kshiftrw $15, %k3, %k3 -; CHECK-NEXT: kshiftlw $14, %k1, %k1 -; CHECK-NEXT: kshiftrw $15, %k1, %k1 -; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kshiftlw $14, 
%k0, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: kmovw %k3, %ecx ; CHECK-NEXT: vmovd %ecx, %xmm2 -; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; CHECK-NEXT: kmovw %k2, %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq @@ -2911,23 +2911,23 @@ ; CHECK-LABEL: test_mask_vextracti64x4: ; CHECK: ## BB#0: ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: kshiftlw $12, %k1, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kshiftlw $12, %k0, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kshiftlw $13, %k0, %k2 ; CHECK-NEXT: kshiftrw $15, %k2, %k2 -; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftlw $15, %k0, %k3 ; CHECK-NEXT: kshiftrw $15, %k3, %k3 -; CHECK-NEXT: kshiftlw $14, %k1, %k1 -; CHECK-NEXT: kshiftrw $15, %k1, %k1 -; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kshiftlw $14, %k0, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: kmovw %k3, %ecx ; CHECK-NEXT: vmovd %ecx, %xmm2 -; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 ; CHECK-NEXT: kmovw %k2, %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2 ; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 @@ -2942,23 +2942,23 @@ ; CHECK-LABEL: test_maskz_vextracti32x4: ; CHECK: ## BB#0: ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: kshiftlw $12, %k1, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kshiftlw $12, %k0, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kshiftlw $13, %k0, %k2 ; CHECK-NEXT: kshiftrw $15, %k2, %k2 -; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftlw $15, %k0, %k3 ; CHECK-NEXT: kshiftrw $15, %k3, %k3 -; CHECK-NEXT: kshiftlw $14, %k1, %k1 -; CHECK-NEXT: kshiftrw $15, %k1, %k1 -; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kshiftlw $14, %k0, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: kmovw %k3, %ecx ; CHECK-NEXT: vmovd %ecx, %xmm1 -; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; CHECK-NEXT: kmovw %k2, %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1 ; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 Index: test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -121,6 +121,8 @@ ; CHECK-NEXT: kmovw %eax, %k2 ; CHECK-NEXT: kxorw 
%k0, %k1, %k0 ; CHECK-NEXT: kxorw %k0, %k2, %k0 +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: ## kill: %AX %AX %EAX ; CHECK-NEXT: retq @@ -269,7 +271,6 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_sqrt_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1} @@ -296,7 +297,6 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_sqrt_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1} @@ -2214,7 +2214,6 @@ define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_rn: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2226,7 +2225,6 @@ define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_rd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2238,7 +2236,6 @@ define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_ru: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2250,7 +2247,6 @@ define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_rz: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2262,7 +2258,6 @@ define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_current: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2274,7 +2269,6 @@ define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_add_ss_rn: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2294,7 +2288,6 @@ define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_ss_current_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 @@ -2311,7 +2304,6 @@ define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_add_ss_current_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2329,7 +2321,6 @@ define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 
%mask) { ; CHECK-LABEL: test_mask_add_sd_rn: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2341,7 +2332,6 @@ define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_rd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2353,7 +2343,6 @@ define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_ru: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2365,7 +2354,6 @@ define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_rz: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2377,7 +2365,6 @@ define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_current: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2389,7 +2376,6 @@ define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_add_sd_rn: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2409,7 +2395,6 @@ define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_add_sd_current_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 @@ -2424,7 +2409,6 @@ define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_add_sd_current_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2440,7 +2424,6 @@ define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_ss_sae: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2452,7 +2435,6 @@ define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_ss_sae: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2472,7 +2454,6 @@ define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovaps %xmm2, %xmm0 @@ -2484,7 +2465,6 @@ define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> 
%a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2504,7 +2484,6 @@ define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_ss_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 @@ -2521,7 +2500,6 @@ define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_ss_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2538,7 +2516,6 @@ define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_sd_sae: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2550,7 +2527,6 @@ define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_sd_sae: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2570,7 +2546,6 @@ define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vmovapd %xmm2, %xmm0 @@ -2582,7 +2557,6 @@ define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -2602,7 +2576,6 @@ define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_mask_max_sd_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovapd %xmm1, %xmm0 @@ -2617,7 +2590,6 @@ define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) { ; CHECK-LABEL: test_maskz_max_sd_memfold: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -3651,7 +3623,6 @@ define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { ; CHECK-LABEL: test_getexp_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1} @@ -3678,7 +3649,6 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { ; CHECK-LABEL: test_getexp_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1} @@ -3705,12 +3675,9 @@ define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; 
CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) @@ -3720,19 +3687,18 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all: ; CHECK: ## BB#0: +; CHECK-NEXT: vcmplesd %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %esi ; CHECK-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0 -; CHECK-NEXT: vcmplesd %xmm1, %xmm0, %k1 -; CHECK-NEXT: korw %k0, %k1, %k0 -; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k1 -; CHECK-NEXT: vcmpneqsd %xmm1, %xmm0, %k2 -; CHECK-NEXT: korw %k1, %k2, %k1 -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k2 -; CHECK-NEXT: kandw %k2, %k1, %k1 -; CHECK-NEXT: korw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %edx +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %ecx +; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: ## kill: %AL %AL %EAX +; CHECK-NEXT: orb %sil, %dl +; CHECK-NEXT: orb %cl, %al +; CHECK-NEXT: orb %dl, %al ; CHECK-NEXT: retq %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4) @@ -3751,12 +3717,9 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4) @@ -3767,19 +3730,18 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all: ; CHECK: ## BB#0: -; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k1 -; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %esi +; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %edx ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k2 {%k1} -; CHECK-NEXT: kmovw %k2, %ecx -; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1} -; CHECK-NEXT: kmovw %k1, %edx -; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovw %k0, %ecx +; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: andb %sil, %dl ; CHECK-NEXT: andb %cl, %al ; CHECK-NEXT: andb %dl, %al -; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4) %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8) @@ -3898,7 +3860,6 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1} @@ -3924,7 +3885,6 
@@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z} @@ -4056,7 +4016,6 @@ define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0 @@ -4073,7 +4032,6 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0 @@ -4596,7 +4554,6 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm3 ; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} @@ -4620,7 +4577,6 @@ define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm3 ; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z} @@ -4690,7 +4646,6 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm3 ; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} @@ -4714,7 +4669,6 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm3 ; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z} @@ -4815,7 +4769,6 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm3 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} @@ -4843,7 +4796,6 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm3 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1} @@ -4871,7 +4823,6 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, 
%edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm0, %xmm3 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z} @@ -4889,7 +4840,6 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -4903,7 +4853,6 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1} @@ -4931,7 +4880,6 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1} @@ -4958,7 +4906,6 @@ ; CHECK-LABEL: fmadd_ss_mask_memfold: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: andl $1, %edx ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} ; CHECK-NEXT: vmovss %xmm0, (%rdi) @@ -4986,7 +4933,6 @@ ; CHECK-LABEL: fmadd_ss_maskz_memfold: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: andl $1, %edx ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vmovss %xmm0, (%rdi) @@ -5014,7 +4960,6 @@ ; CHECK-LABEL: fmadd_sd_mask_memfold: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: andl $1, %edx ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1} ; CHECK-NEXT: vmovlpd %xmm0, (%rdi) @@ -5038,7 +4983,6 @@ ; CHECK-LABEL: fmadd_sd_maskz_memfold: ; CHECK: ## BB#0: ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: andl $1, %edx ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vmovlpd %xmm0, (%rdi) @@ -5063,7 +5007,6 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1} @@ -5091,7 +5034,6 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1} @@ -5119,7 +5061,6 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1} @@ -5147,7 +5088,6 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> 
%x2, i8 %x3,i32 %x4 ){ ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1} @@ -5173,7 +5113,6 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1} ; CHECK-NEXT: vmovaps %xmm1, %xmm0 @@ -5187,7 +5126,6 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq @@ -5201,8 +5139,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm: ; CHECK: ## BB#0: -; CHECK-NEXT: kxorw %k0, %k0, %k1 -; CHECK-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq %q = load float, float* %ptr_b %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 Index: test/CodeGen/X86/avx512-load-store.ll =================================================================== --- test/CodeGen/X86/avx512-load-store.ll +++ test/CodeGen/X86/avx512-load-store.ll @@ -12,7 +12,7 @@ ; CHECK32-LABEL: test_mm_mask_move_ss: ; CHECK32: # BB#0: # %entry ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al -; CHECK32-NEXT: andl $1, %eax +; CHECK32-NEXT: andb $1, %al ; CHECK32-NEXT: kmovw %eax, %k1 ; CHECK32-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} ; CHECK32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] @@ -37,9 +37,9 @@ ; CHECK32-LABEL: test_mm_maskz_move_ss: ; CHECK32: # BB#0: # %entry ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al -; CHECK32-NEXT: andl $1, %eax -; CHECK32-NEXT: kmovw %eax, %k1 +; CHECK32-NEXT: andb $1, %al ; CHECK32-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK32-NEXT: kmovw %eax, %k1 ; CHECK32-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} ; CHECK32-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; CHECK32-NEXT: retl @@ -62,7 +62,7 @@ ; CHECK32-LABEL: test_mm_mask_move_sd: ; CHECK32: # BB#0: # %entry ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al -; CHECK32-NEXT: andl $1, %eax +; CHECK32-NEXT: andb $1, %al ; CHECK32-NEXT: kmovw %eax, %k1 ; CHECK32-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; CHECK32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1] @@ -87,9 +87,9 @@ ; CHECK32-LABEL: test_mm_maskz_move_sd: ; CHECK32: # BB#0: # %entry ; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al -; CHECK32-NEXT: andl $1, %eax -; CHECK32-NEXT: kmovw %eax, %k1 +; CHECK32-NEXT: andb $1, %al ; CHECK32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK32-NEXT: kmovw %eax, %k1 ; CHECK32-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} ; CHECK32-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; CHECK32-NEXT: retl Index: test/CodeGen/X86/avx512-mask-bugfix.ll =================================================================== --- test/CodeGen/X86/avx512-mask-bugfix.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s - -; ModuleID = 'foo.ll' -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: nounwind readnone 
-declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #0 - -; Function Attrs: nounwind readnone -declare i64 @llvm.cttz.i64(i64, i1) #0 - -; Function Attrs: nounwind -define void @foo(float* noalias %aFOO, float %b, i32 %a) { -allocas: - %full_mask_memory.i57 = alloca <8 x float> - %return_value_memory.i60 = alloca i1 - %cmp.i = icmp eq i32 %a, 65535 - br i1 %cmp.i, label %all_on, label %some_on - -all_on: - %mask0 = load <8 x float>, <8 x float>* %full_mask_memory.i57 - %v0.i.i.i70 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) #0 - %allon.i.i76 = icmp eq i32 %v0.i.i.i70, 65535 - br i1 %allon.i.i76, label %check_neighbors.i.i121, label %domixed.i.i100 - -domixed.i.i100: - br label %check_neighbors.i.i121 - -check_neighbors.i.i121: - %v1.i5.i.i116 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) #0 - %alleq.i.i120 = icmp eq i32 %v1.i5.i.i116, 65535 - br i1 %alleq.i.i120, label %all_equal.i.i123, label %not_all_equal.i.i124 - -; CHECK: kxnorw %k0, %k0, %k0 -; CHECK: kshiftrw $15, %k0, %k0 -; CHECK: jmp -; CHECK: kxorw %k0, %k0, %k0 - -all_equal.i.i123: - br label %reduce_equal___vyi.exit128 - -not_all_equal.i.i124: - br label %reduce_equal___vyi.exit128 - -reduce_equal___vyi.exit128: - %calltmp2.i125 = phi i1 [ true, %all_equal.i.i123 ], [ false, %not_all_equal.i.i124 ] - store i1 %calltmp2.i125, i1* %return_value_memory.i60 - %return_value.i126 = load i1, i1* %return_value_memory.i60 - %. = select i1 %return_value.i126, i32 1, i32 0 - %select_to_float = sitofp i32 %. to float - ret void - -some_on: - ret void -} - Index: test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- test/CodeGen/X86/avx512-mask-op.ll +++ test/CodeGen/X86/avx512-mask-op.ll @@ -418,8 +418,7 @@ ; KNL-NEXT: kshiftlw $10, %k0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: andb $1, %al ; KNL-NEXT: retq ; ; SKX-LABEL: zext_test3: @@ -427,9 +426,8 @@ ; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ; SKX-NEXT: kshiftlw $10, %k0, %k0 ; SKX-NEXT: kshiftrw $15, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: ## kill: %AL %AL %EAX +; SKX-NEXT: kmovb %k0, %eax +; SKX-NEXT: andb $1, %al ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq ; @@ -438,9 +436,8 @@ ; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kshiftlw $10, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: andl $1, %eax -; AVX512BW-NEXT: ## kill: %AL %AL %EAX +; AVX512BW-NEXT: kmovw %k0, %eax +; AVX512BW-NEXT: andb $1, %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -449,9 +446,8 @@ ; AVX512DQ-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 ; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 -; AVX512DQ-NEXT: kmovw %k0, %eax -; AVX512DQ-NEXT: andl $1, %eax -; AVX512DQ-NEXT: ## kill: %AL %AL %EAX +; AVX512DQ-NEXT: kmovb %k0, %eax +; AVX512DQ-NEXT: andb $1, %al ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq %cmp_res = icmp ugt <16 x i32> %a, %b @@ -965,8 +961,8 @@ ; SKX-LABEL: test16: ; SKX: ## BB#0: ; SKX-NEXT: kmovq %rdi, %k0 -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kshiftrw $15, %k1, %k1 +; SKX-NEXT: movb $1, %al +; SKX-NEXT: kmovb %eax, %k1 ; SKX-NEXT: vpmovm2b %k1, %zmm0 ; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 ; SKX-NEXT: vpmovm2b %k0, %zmm1 @@ -981,8 +977,8 @@ ; AVX512BW-LABEL: test16: ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovq %rdi, %k0 -; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; 
AVX512BW-NEXT: kshiftrw $15, %k1, %k1 +; AVX512BW-NEXT: movb $1, %al +; AVX512BW-NEXT: kmovw %eax, %k1 ; AVX512BW-NEXT: vpmovm2b %k1, %zmm0 ; AVX512BW-NEXT: vpsllq $40, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 @@ -1085,8 +1081,7 @@ ; SKX-NEXT: kmovq %rdi, %k0 ; SKX-NEXT: cmpl %edx, %esi ; SKX-NEXT: setg %al -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: kmovb %eax, %k1 ; SKX-NEXT: vpmovm2b %k1, %zmm0 ; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 ; SKX-NEXT: vpmovm2b %k0, %zmm1 @@ -1103,8 +1098,7 @@ ; AVX512BW-NEXT: kmovq %rdi, %k0 ; AVX512BW-NEXT: cmpl %edx, %esi ; AVX512BW-NEXT: setg %al -; AVX512BW-NEXT: andl $1, %eax -; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: kmovw %eax, %k1 ; AVX512BW-NEXT: vpmovm2b %k1, %zmm0 ; AVX512BW-NEXT: vpsllq $40, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm1 @@ -1166,21 +1160,21 @@ ; KNL-LABEL: test18: ; KNL: ## BB#0: ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: kshiftlw $7, %k2, %k0 -; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k2 ; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k3 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k3} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: kshiftlw $7, %k0, %k0 -; KNL-NEXT: korw %k0, %k1, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq @@ -1209,21 +1203,21 @@ ; AVX512BW-LABEL: test18: ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: kshiftlw $7, %k2, %k0 -; AVX512BW-NEXT: kshiftrw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512BW-NEXT: kmovd %esi, %k0 +; AVX512BW-NEXT: kshiftlw $7, %k0, %k2 ; AVX512BW-NEXT: kshiftrw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $15, %k0, %k3 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k3} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $7, %k2, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: vzeroupper @@ -1383,10 +1377,8 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { ; KNL-LABEL: store_v1i1: ; KNL: ## BB#0: -; KNL-NEXT: andl $1, %edi ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kxnorw %k0, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, 
%k1 ; KNL-NEXT: kxorw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: movb %al, (%rsi) @@ -1394,20 +1386,16 @@ ; ; SKX-LABEL: store_v1i1: ; SKX: ## BB#0: -; SKX-NEXT: andl $1, %edi ; SKX-NEXT: kmovd %edi, %k0 ; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kshiftrw $15, %k1, %k1 ; SKX-NEXT: kxorw %k1, %k0, %k0 ; SKX-NEXT: kmovb %k0, (%rsi) ; SKX-NEXT: retq ; ; AVX512BW-LABEL: store_v1i1: ; AVX512BW: ## BB#0: -; AVX512BW-NEXT: andl $1, %edi ; AVX512BW-NEXT: kmovd %edi, %k0 ; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $15, %k1, %k1 ; AVX512BW-NEXT: kxorw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movb %al, (%rsi) @@ -1415,10 +1403,8 @@ ; ; AVX512DQ-LABEL: store_v1i1: ; AVX512DQ: ## BB#0: -; AVX512DQ-NEXT: andl $1, %edi ; AVX512DQ-NEXT: kmovw %edi, %k0 ; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1 -; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 ; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 ; AVX512DQ-NEXT: kmovb %k0, (%rsi) ; AVX512DQ-NEXT: retq @@ -1610,59 +1596,14 @@ @f1.v = internal unnamed_addr global i1 false, align 4 define void @f1(i32 %c) { -; KNL-LABEL: f1: -; KNL: ## BB#0: ## %entry -; KNL-NEXT: movzbl {{.*}}(%rip), %edi -; KNL-NEXT: movl %edi, %eax -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k0 -; KNL-NEXT: kxnorw %k0, %k0, %k1 -; KNL-NEXT: kshiftrw $15, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: movb %al, {{.*}}(%rip) -; KNL-NEXT: xorl $1, %edi -; KNL-NEXT: jmp _f2 ## TAILCALL -; -; SKX-LABEL: f1: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: movzbl {{.*}}(%rip), %edi -; SKX-NEXT: movl %edi, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kshiftrw $15, %k1, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 -; SKX-NEXT: kmovb %k0, {{.*}}(%rip) -; SKX-NEXT: xorl $1, %edi -; SKX-NEXT: jmp _f2 ## TAILCALL -; -; AVX512BW-LABEL: f1: -; AVX512BW: ## BB#0: ## %entry -; AVX512BW-NEXT: movzbl {{.*}}(%rip), %edi -; AVX512BW-NEXT: movl %edi, %eax -; AVX512BW-NEXT: andl $1, %eax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $15, %k1, %k1 -; AVX512BW-NEXT: kxorw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, {{.*}}(%rip) -; AVX512BW-NEXT: xorl $1, %edi -; AVX512BW-NEXT: jmp _f2 ## TAILCALL -; -; AVX512DQ-LABEL: f1: -; AVX512DQ: ## BB#0: ## %entry -; AVX512DQ-NEXT: movzbl {{.*}}(%rip), %edi -; AVX512DQ-NEXT: movl %edi, %eax -; AVX512DQ-NEXT: andl $1, %eax -; AVX512DQ-NEXT: kmovw %eax, %k0 -; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1 -; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k0, %k0 -; AVX512DQ-NEXT: kmovb %k0, {{.*}}(%rip) -; AVX512DQ-NEXT: xorl $1, %edi -; AVX512DQ-NEXT: jmp _f2 ## TAILCALL +; CHECK-LABEL: f1: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movzbl {{.*}}(%rip), %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorb $1, %al +; CHECK-NEXT: movb %al, {{.*}}(%rip) +; CHECK-NEXT: xorl $1, %edi +; CHECK-NEXT: jmp _f2 ## TAILCALL entry: %.b1 = load i1, i1* @f1.v, align 4 %not..b1 = xor i1 %.b1, true Index: test/CodeGen/X86/avx512-memfold.ll =================================================================== --- test/CodeGen/X86/avx512-memfold.ll +++ test/CodeGen/X86/avx512-memfold.ll @@ -4,12 +4,9 @@ define i8 @test_int_x86_avx512_mask_cmp_ss(<4 x float> %a, float* %b, i8 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vcmpunordss (%rdi), %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovw 
%k0, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %b.val = load float, float* %b %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 @@ -24,7 +21,6 @@ define <4 x float> @test_mask_max_ss(<4 x float> %a, float* %b, i8 %mask) { ; CHECK-LABEL: test_mask_max_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -41,7 +37,6 @@ define <4 x float> @test_maskz_add_ss(<4 x float> %a, float* %b, i8 %mask) { ; CHECK-LABEL: test_maskz_add_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -61,7 +56,6 @@ define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, double* %c, i8 %mask){ ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %esi ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq Index: test/CodeGen/X86/avx512-regcall-NoMask.ll =================================================================== --- test/CodeGen/X86/avx512-regcall-NoMask.ll +++ test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -1,16 +1,10 @@ -; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=X32 %s -; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=ALL --check-prefix=X32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=ALL --check-prefix=WIN64 %s ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=LINUXOSX64 %s -; X32-LABEL: test_argReti1: -; X32: kmov{{.*}} %eax, %k{{[0-7]}} -; X32: kmov{{.*}} %k{{[0-7]}}, %eax -; X32: ret{{.*}} - -; WIN64-LABEL: test_argReti1: -; WIN64: kmov{{.*}} %eax, %k{{[0-7]}} -; WIN64: kmov{{.*}} %k{{[0-7]}}, %eax -; WIN64: ret{{.*}} +; ALL-LABEL: test_argReti1: +; ALL: incb %al +; ALL: ret{{.*}} ; Test regcall when receiving/returning i1 define x86_regcallcc i1 @test_argReti1(i1 %a) { @@ -18,17 +12,11 @@ ret i1 %add } -; X32-LABEL: test_CallargReti1: -; X32: kmov{{.*}} %k{{[0-7]}}, %eax -; X32: call{{.*}} {{.*}}test_argReti1 -; X32: kmov{{.*}} %eax, %k{{[0-7]}} -; X32: ret{{.*}} - -; WIN64-LABEL: test_CallargReti1: -; WIN64: kmov{{.*}} %k{{[0-7]}}, %eax -; WIN64: call{{.*}} {{.*}}test_argReti1 -; WIN64: kmov{{.*}} %eax, %k{{[0-7]}} -; WIN64: ret{{.*}} +; ALL-LABEL: test_CallargReti1: +; ALL: movzbl %al, %eax +; ALL: call{{.*}}test_argReti1 +; ALL: incb %al +; ALL: ret{{.*}} ; Test regcall when passing/retrieving i1 define x86_regcallcc i1 @test_CallargReti1(i1 %a) { Index: test/CodeGen/X86/avx512-select.ll =================================================================== --- test/CodeGen/X86/avx512-select.ll +++ test/CodeGen/X86/avx512-select.ll @@ -161,7 +161,7 @@ define double @pr30561_f64(double %b, double %a, i1 %c) { ; CHECK-LABEL: pr30561_f64: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: andb $1, %dil ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; CHECK-NEXT: retq @@ -172,7 +172,7 @@ define float 
@pr30561_f32(float %b, float %a, i1 %c) { ; CHECK-LABEL: pr30561_f32: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: andb $1, %dil ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; CHECK-NEXT: retq Index: test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll =================================================================== --- test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -13,10 +13,9 @@ ; CHECK-NEXT: kshiftlb $6, %k0, %k0 ; CHECK-NEXT: kshiftrb $7, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vmovq %rax, %xmm2 -; CHECK-NEXT: kmovw %k1, %eax -; CHECK-NEXT: vmovq %rax, %xmm3 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; CHECK-NEXT: kmovw %k1, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm2 +; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2 ; CHECK-NEXT: vpsraq $63, %zmm2, %zmm2 ; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 Index: test/CodeGen/X86/avx512dq-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512dq-intrinsics.ll +++ test/CodeGen/X86/avx512dq-intrinsics.ll @@ -262,8 +262,7 @@ define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vreducess $4, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vreducess $4, {sae}, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 @@ -279,8 +278,7 @@ define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_range_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 @@ -296,8 +294,7 @@ define <2 x double>@test_int_x86_avx512_mask_reduce_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vreducesd $4, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vreducesd $4, {sae}, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 @@ -313,8 +310,7 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { ; CHECK-LABEL: test_int_x86_avx512_mask_range_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1} ; CHECK-NEXT: vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 @@ -367,16 +363,12 @@ define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) { ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sd: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: kmovb %k0, %ecx ; CHECK-NEXT: vfpclasssd $4, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: kmovb %k0, %eax ; CHECK-NEXT: addb %cl, %al -; CHECK-NEXT: ## kill: 
%AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 2, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 4, i8 -1) @@ -389,16 +381,12 @@ define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) { ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ss: ; CHECK: ## BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kmovb %edi, %k1 ; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %ecx -; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: kmovb %k0, %ecx ; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: kmovb %k0, %eax ; CHECK-NEXT: addb %cl, %al -; CHECK-NEXT: ## kill: %AL %AL %EAX ; CHECK-NEXT: retq %res = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 %x1) %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 -1) Index: test/CodeGen/X86/avx512er-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512er-intrinsics.ll +++ test/CodeGen/X86/avx512er-intrinsics.ll @@ -121,9 +121,7 @@ define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) { ; CHECK-LABEL: test_rsqrt28_ss_maskz: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0] +; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ; ret <4 x float> %res @@ -132,10 +130,7 @@ define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) { ; CHECK-LABEL: test_rsqrt28_ss_mask: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] +; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ; ret <4 x float> %res @@ -144,9 +139,7 @@ define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) { ; CHECK-LABEL: test_rsqrt28_sd_maskz: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0] +; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xcd,0xc0] ; CHECK-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ; ret <2 x double> %res @@ -155,10 +148,7 @@ define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0) { ; CHECK-LABEL: test_rsqrt28_sd_mask: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; 
CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1] -; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] +; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xcd,0xc1] ; CHECK-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 7, i32 8) ; ret <2 x double> %res @@ -169,9 +159,7 @@ define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) { ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07] +; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0xcd,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] %mem = load double , double * %ptr, align 8 %mem_v = insertelement <2 x double> undef, double %mem, i32 0 @@ -182,9 +170,7 @@ define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) { ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem_offset: ; CHECK: # BB#0: -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0] -; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f] -; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12] +; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0xcd,0x47,0x12] ; CHECK-NEXT: retq # encoding: [0xc3] %ptr1 = getelementptr double, double* %ptr, i32 18 %mem = load double , double * %ptr1, align 8 Index: test/CodeGen/X86/fast-isel-load-i1.ll =================================================================== --- test/CodeGen/X86/fast-isel-load-i1.ll +++ test/CodeGen/X86/fast-isel-load-i1.ll @@ -4,9 +4,7 @@ define i1 @test_i1(i1* %b) { ; CHECK-LABEL: test_i1: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movzbl (%rdi), %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: testb $1, %al +; CHECK-NEXT: testb $1, (%rdi) ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # BB#1: # %in ; CHECK-NEXT: xorl %eax, %eax Index: test/CodeGen/X86/fma-fneg-combine.ll =================================================================== --- test/CodeGen/X86/fma-fneg-combine.ll +++ test/CodeGen/X86/fma-fneg-combine.ll @@ -141,8 +141,7 @@ ; SKX-LABEL: test11: ; SKX: # BB#0: # %entry ; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm0 -; SKX-NEXT: andl $1, %edi -; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq ; @@ -150,7 +149,6 @@ ; KNL: # BB#0: # %entry ; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm0 ; KNL-NEXT: vxorps %xmm0, %xmm2, %xmm0 -; KNL-NEXT: andl $1, %edi ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq @@ -186,18 +184,17 @@ ; SKX-LABEL: test13: ; SKX: # BB#0: # %entry ; SKX-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 -; SKX-NEXT: andl $1, %edi -; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: kmovb %edi, %k1 ; SKX-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq ; ; KNL-LABEL: test13: ; KNL: # BB#0: # %entry ; KNL-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: andl $1, %edi ; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq + entry: 
%sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %sub.i, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4) Index: test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- test/CodeGen/X86/masked_gather_scatter.ll +++ test/CodeGen/X86/masked_gather_scatter.ll @@ -300,8 +300,8 @@ ; ; KNL_32-LABEL: test6: ; KNL_32: # BB#0: -; KNL_32-NEXT: kxnorw %k0, %k0, %k1 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2 +; KNL_32-NEXT: kxnorw %k0, %k0, %k1 ; KNL_32-NEXT: kxnorw %k0, %k0, %k2 ; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2} ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1} @@ -1575,7 +1575,7 @@ ; Check non-power-of-2 case. It should be scalarized. declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>) define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { -; ALL-LABEL: test30: +; ALL-LABEL: test30 ; ALL-NOT: gather %sext_ind = sext <3 x i32> %ind to <3 x i64> @@ -1691,12 +1691,12 @@ ; KNL_32-LABEL: test_gather_16i64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Lcfi4: +; KNL_32-NEXT: .Lcfi0: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Lcfi5: +; KNL_32-NEXT: .Lcfi1: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Lcfi6: +; KNL_32-NEXT: .Lcfi2: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp @@ -1814,12 +1814,12 @@ ; KNL_32-LABEL: test_gather_16f64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Lcfi7: +; KNL_32-NEXT: .Lcfi3: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Lcfi8: +; KNL_32-NEXT: .Lcfi4: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Lcfi9: +; KNL_32-NEXT: .Lcfi5: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp @@ -1936,12 +1936,12 @@ ; KNL_32-LABEL: test_scatter_16i64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Lcfi10: +; KNL_32-NEXT: .Lcfi6: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Lcfi11: +; KNL_32-NEXT: .Lcfi7: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Lcfi12: +; KNL_32-NEXT: .Lcfi8: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp @@ -2058,12 +2058,12 @@ ; KNL_32-LABEL: test_scatter_16f64: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Lcfi13: +; KNL_32-NEXT: .Lcfi9: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Lcfi14: +; KNL_32-NEXT: .Lcfi10: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Lcfi15: +; KNL_32-NEXT: .Lcfi11: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-64, %esp ; KNL_32-NEXT: subl $64, %esp @@ -2139,12 +2139,12 @@ ; KNL_32-LABEL: test_pr28312: ; KNL_32: # BB#0: ; KNL_32-NEXT: pushl %ebp -; KNL_32-NEXT: .Lcfi16: +; KNL_32-NEXT: .Lcfi12: ; KNL_32-NEXT: .cfi_def_cfa_offset 8 -; KNL_32-NEXT: .Lcfi17: +; KNL_32-NEXT: .Lcfi13: ; KNL_32-NEXT: .cfi_offset %ebp, -8 ; KNL_32-NEXT: movl %esp, %ebp -; KNL_32-NEXT: .Lcfi18: +; KNL_32-NEXT: .Lcfi14: ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: andl $-32, %esp ; KNL_32-NEXT: subl $32, %esp Index: test/CodeGen/X86/pr27591.ll =================================================================== --- test/CodeGen/X86/pr27591.ll +++ test/CodeGen/X86/pr27591.ll @@ -9,12 +9,6 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: testl %edi, 
%edi ; CHECK-NEXT: setne %al -; CHECK-NEXT: # implicit-def: %EDI -; CHECK-NEXT: movb %al, %dil -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovd %edi, %k0 -; CHECK-NEXT: kmovd %k0, %edi -; CHECK-NEXT: movb %dil, %al ; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: movzbl %al, %edi ; CHECK-NEXT: callq callee1 @@ -32,17 +26,9 @@ ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: setne %al -; CHECK-NEXT: # implicit-def: %EDI -; CHECK-NEXT: movb %al, %dil -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: kmovd %edi, %k0 -; CHECK-NEXT: kmovd %k0, %edi +; CHECK-NEXT: movzbl %al, %edi ; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movb %dil, %al -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: movl $-1, %ecx -; CHECK-NEXT: cmovnel %ecx, %edi +; CHECK-NEXT: negl %edi ; CHECK-NEXT: callq callee2 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq Index: test/CodeGen/X86/pr28173.ll =================================================================== --- test/CodeGen/X86/pr28173.ll +++ test/CodeGen/X86/pr28173.ll @@ -8,9 +8,8 @@ define i64 @foo64(i1 zeroext %i) #0 { ; CHECK-LABEL: foo64: ; CHECK: # BB#0: -; CHECK-NEXT: # kill: %EDI %EDI %RDI -; CHECK-NEXT: orq $-2, %rdi -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: orq $-2, %rax ; CHECK-NEXT: retq br label %bb @@ -26,8 +25,9 @@ define i16 @foo16(i1 zeroext %i) #0 { ; CHECK-LABEL: foo16: ; CHECK: # BB#0: -; CHECK-NEXT: orl $65534, %edi # imm = 0xFFFE -; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: orl $65534, %eax # imm = 0xFFFE +; CHECK-NEXT: # kill: %AX %AX %EAX ; CHECK-NEXT: retq br label %bb @@ -43,9 +43,9 @@ define i16 @foo16_1(i1 zeroext %i, i32 %j) #0 { ; CHECK-LABEL: foo16_1: ; CHECK: # BB#0: -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: orl $2, %edi -; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: orl $2, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX ; CHECK-NEXT: retq br label %bb @@ -61,8 +61,8 @@ define i32 @foo32(i1 zeroext %i) #0 { ; CHECK-LABEL: foo32: ; CHECK: # BB#0: -; CHECK-NEXT: orl $-2, %edi -; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: orl $-2, %eax ; CHECK-NEXT: retq br label %bb Index: test/CodeGen/X86/pr32241.ll =================================================================== --- test/CodeGen/X86/pr32241.ll +++ test/CodeGen/X86/pr32241.ll @@ -4,49 +4,59 @@ define i32 @_Z3foov() { ; CHECK-LABEL: _Z3foov: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: subl $20, %esp +; CHECK-NEXT: pushl %esi ; CHECK-NEXT: .Lcfi0: -; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: subl $24, %esp +; CHECK-NEXT: .Lcfi1: +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .Lcfi2: +; CHECK-NEXT: .cfi_offset %esi, -8 +; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: movw $10959, {{[0-9]+}}(%esp) # imm = 0x2ACF ; CHECK-NEXT: movw $-15498, {{[0-9]+}}(%esp) # imm = 0xC376 ; CHECK-NEXT: movw $19417, {{[0-9]+}}(%esp) # imm = 0x4BD9 -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %cx -; CHECK-NEXT: kxnorw %k0, %k0, %k0 -; CHECK-NEXT: kshiftrw $15, %k0, %k0 -; CHECK-NEXT: testw %cx, %cx -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: cmpw $0, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp) # 1-byte Spill ; CHECK-NEXT: jne .LBB0_2 -; CHECK-NEXT: jmp .LBB0_1 -; 
CHECK-NEXT: .LBB0_1: # %lor.rhs +; CHECK-NEXT: # BB#1: # %lor.rhs ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: kmovd %eax, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill +; CHECK-NEXT: kmovb %k0, %ecx +; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill ; CHECK-NEXT: jmp .LBB0_2 ; CHECK-NEXT: .LBB0_2: # %lor.end -; CHECK-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload -; CHECK-NEXT: kxnorw %k0, %k0, %k1 -; CHECK-NEXT: kshiftrw $15, %k1, %k1 -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill -; CHECK-NEXT: kmovw %k1, {{[0-9]+}}(%esp) # 2-byte Spill +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al # 1-byte Reload +; CHECK-NEXT: movb $1, %cl +; CHECK-NEXT: andb $1, %al +; CHECK-NEXT: movzbl %al, %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload +; CHECK-NEXT: subl %edx, %esi +; CHECK-NEXT: setl %al +; CHECK-NEXT: andb $1, %al +; CHECK-NEXT: movzbl %al, %edx +; CHECK-NEXT: xorl $-1, %edx +; CHECK-NEXT: cmpl $0, %edx +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill ; CHECK-NEXT: jne .LBB0_4 -; CHECK-NEXT: jmp .LBB0_3 -; CHECK-NEXT: .LBB0_3: # %lor.rhs4 +; CHECK-NEXT: # BB#3: # %lor.rhs4 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: kmovd %eax, %k0 -; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill +; CHECK-NEXT: kmovb %k0, %ecx +; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill ; CHECK-NEXT: jmp .LBB0_4 ; CHECK-NEXT: .LBB0_4: # %lor.end5 -; CHECK-NEXT: kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: movw %ax, %cx -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al # 1-byte Reload +; CHECK-NEXT: andb $1, %al +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: movw %cx, %dx +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) ; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl $20, %esp +; CHECK-NEXT: addl $24, %esp +; CHECK-NEXT: popl %esi ; CHECK-NEXT: retl entry: %aa = alloca i16, align 2 Index: test/CodeGen/X86/pr32256.ll =================================================================== --- test/CodeGen/X86/pr32256.ll +++ test/CodeGen/X86/pr32256.ll @@ -7,39 +7,29 @@ define void @_Z1av() { ; CHECK-LABEL: _Z1av: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: subl $6, %esp +; CHECK-NEXT: subl $2, %esp ; CHECK-NEXT: .Lcfi0: -; CHECK-NEXT: .cfi_def_cfa_offset 10 +; CHECK-NEXT: .cfi_def_cfa_offset 6 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: kmovd %eax, %k0 ; CHECK-NEXT: movb c, %cl -; CHECK-NEXT: # implicit-def: %EAX -; CHECK-NEXT: movb %cl, %al -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: kmovq %k1, %k2 -; CHECK-NEXT: kxnorw %k0, %k0, %k3 -; CHECK-NEXT: kshiftrw $15, %k3, %k3 -; CHECK-NEXT: kxorw %k3, %k1, %k1 -; CHECK-NEXT: kmovd %k1, %eax -; CHECK-NEXT: movb %al, %cl +; CHECK-NEXT: xorb $-1, %cl ; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: kmovw %k2, {{[0-9]+}}(%esp) # 2-byte Spill -; CHECK-NEXT: kmovw %k0, (%esp) # 2-byte Spill +; CHECK-NEXT: kmovb %k0, %ecx +; CHECK-NEXT: movb %cl, (%esp) # 1-byte Spill ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: jmp .LBB0_2 ; CHECK-NEXT: .LBB0_1: # %land.rhs ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: kmovd %eax, %k0 -; CHECK-NEXT: kmovw %k0, (%esp) # 2-byte Spill +; CHECK-NEXT: kmovb %k0, %ecx +; CHECK-NEXT: movb %cl, (%esp) # 1-byte Spill ; CHECK-NEXT: jmp .LBB0_2 ; CHECK-NEXT: .LBB0_2: # %land.end -; CHECK-NEXT: kmovw (%esp), %k0 # 2-byte Reload -; CHECK-NEXT: kmovd %k0, 
%eax -; CHECK-NEXT: movb %al, %cl -; CHECK-NEXT: andb $1, %cl -; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) -; CHECK-NEXT: addl $6, %esp +; CHECK-NEXT: movb (%esp), %al # 1-byte Reload +; CHECK-NEXT: andb $1, %al +; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp) +; CHECK-NEXT: addl $2, %esp ; CHECK-NEXT: retl entry: %b = alloca i8, align 1 Index: test/CodeGen/X86/pr32284.ll =================================================================== --- test/CodeGen/X86/pr32284.ll +++ test/CodeGen/X86/pr32284.ll @@ -40,12 +40,6 @@ ; X86-O0-NEXT: movzbl %cl, %edx ; X86-O0-NEXT: subl %eax, %edx ; X86-O0-NEXT: setle %cl -; X86-O0-NEXT: # implicit-def: %EAX -; X86-O0-NEXT: movb %cl, %al -; X86-O0-NEXT: andl $1, %eax -; X86-O0-NEXT: kmovd %eax, %k0 -; X86-O0-NEXT: kmovd %k0, %eax -; X86-O0-NEXT: movb %al, %cl ; X86-O0-NEXT: andb $1, %cl ; X86-O0-NEXT: movzbl %cl, %eax ; X86-O0-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -80,12 +74,6 @@ ; X64-O0-NEXT: movzbl %sil, %edi ; X64-O0-NEXT: subl %eax, %edi ; X64-O0-NEXT: setle %dl -; X64-O0-NEXT: # implicit-def: %EAX -; X64-O0-NEXT: movb %dl, %al -; X64-O0-NEXT: andl $1, %eax -; X64-O0-NEXT: kmovd %eax, %k0 -; X64-O0-NEXT: kmovd %k0, %eax -; X64-O0-NEXT: movb %al, %dl ; X64-O0-NEXT: andb $1, %dl ; X64-O0-NEXT: movzbl %dl, %eax ; X64-O0-NEXT: movl %eax, -{{[0-9]+}}(%rsp) Index: test/CodeGen/X86/pr32451.ll =================================================================== --- test/CodeGen/X86/pr32451.ll +++ test/CodeGen/X86/pr32451.ll @@ -25,12 +25,6 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload ; CHECK-NEXT: movl 4(%ecx), %edx ; CHECK-NEXT: movb (%edx), %bl -; CHECK-NEXT: # implicit-def: %EDX -; CHECK-NEXT: movb %bl, %dl -; CHECK-NEXT: andl $1, %edx -; CHECK-NEXT: kmovw %edx, %k0 -; CHECK-NEXT: kmovw %k0, %edx -; CHECK-NEXT: movb %dl, %bl ; CHECK-NEXT: andb $1, %bl ; CHECK-NEXT: movzbl %bl, %edx ; CHECK-NEXT: movl %edx, (%esp) Index: test/CodeGen/X86/sse-scalar-fp-arith.ll =================================================================== --- test/CodeGen/X86/sse-scalar-fp-arith.ll +++ test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1119,9 +1119,9 @@ ; ; AVX512-LABEL: add_ss_mask: ; AVX512: # BB#0: -; AVX512-NEXT: andl $1, %edi +; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1} +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} ; AVX512-NEXT: vmovaps %xmm2, %xmm0 ; AVX512-NEXT: retq %1 = extractelement <4 x float> %a, i64 0 @@ -1174,9 +1174,9 @@ ; ; AVX512-LABEL: add_sd_mask: ; AVX512: # BB#0: -; AVX512-NEXT: andl $1, %edi +; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: kmovw %edi, %k1 -; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} ; AVX512-NEXT: vmovapd %xmm2, %xmm0 ; AVX512-NEXT: retq %1 = extractelement <2 x double> %a, i64 0 Index: test/CodeGen/X86/xmulo.ll =================================================================== --- test/CodeGen/X86/xmulo.ll +++ test/CodeGen/X86/xmulo.ll @@ -712,17 +712,11 @@ ; ; KNL-LABEL: bug27873: ; KNL: ## BB#0: -; KNL-NEXT: andl $1, %esi ; KNL-NEXT: movl $160, %ecx ; KNL-NEXT: movq %rdi, %rax ; KNL-NEXT: mulq %rcx -; KNL-NEXT: kmovw %esi, %k0 ; KNL-NEXT: seto %al -; KNL-NEXT: andl $1, %eax -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: korw %k1, %k0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: ## kill: %AL %AL %EAX +; KNL-NEXT: orb %sil, %al ; KNL-NEXT: retq %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160) %mul.overflow = extractvalue { i64, i1 } %mul, 1 Index: 
test/CodeGen/X86/xor-select-i1-combine.ll =================================================================== --- test/CodeGen/X86/xor-select-i1-combine.ll +++ test/CodeGen/X86/xor-select-i1-combine.ll @@ -7,10 +7,10 @@ define i32 @main(i8 %small) { ; CHECK-LABEL: main: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movl $n, %eax -; CHECK-NEXT: movl $m, %ecx ; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: cmovneq %rax, %rcx +; CHECK-NEXT: movl $m, %eax +; CHECK-NEXT: movl $n, %ecx +; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: movl (%rcx), %eax ; CHECK-NEXT: retq entry: Index: test/TableGen/intrinsic-varargs.td =================================================================== --- test/TableGen/intrinsic-varargs.td +++ test/TableGen/intrinsic-varargs.td @@ -23,7 +23,7 @@ } // isVoid needs to match the definition in ValueTypes.td -def isVoid : ValueType<0, 66>; // Produces no value +def isVoid : ValueType<0, 67>; // Produces no value def llvm_vararg_ty : LLVMType<isVoid>; // this means vararg here // CHECK: /* 0 */ 0, 29, 0, Index: utils/TableGen/CodeGenTarget.cpp =================================================================== --- utils/TableGen/CodeGenTarget.cpp +++ utils/TableGen/CodeGenTarget.cpp @@ -75,6 +75,7 @@ case MVT::x86mmx: return "MVT::x86mmx"; case MVT::Glue: return "MVT::Glue"; case MVT::isVoid: return "MVT::isVoid"; + case MVT::v1i1: return "MVT::v1i1"; case MVT::v2i1: return "MVT::v2i1"; case MVT::v4i1: return "MVT::v4i1"; case MVT::v8i1: return "MVT::v8i1";
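
Illustrative note (a minimal sketch, not part of the patch): the CodeGenTarget.cpp hunk above only extends TableGen's getEnumName-style mapping with the new enum value; the substance of the change is that a scalar i1 mask is now modeled as a genuine one-element vector type. Assuming a translation unit built against LLVM headers and libraries, and with printMaskVT as a hypothetical stand-in for that mapping, v1i1 behaves like any other vector MVT:

#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Hypothetical helper, mirroring the switch extended in
// utils/TableGen/CodeGenTarget.cpp above.
static const char *printMaskVT(MVT VT) {
  switch (VT.SimpleTy) {
  case MVT::v1i1: return "MVT::v1i1"; // newly added: 1 x i1
  case MVT::v2i1: return "MVT::v2i1";
  default:        return "(other)";
  }
}

int main() {
  MVT VT = MVT::v1i1;
  // v1i1 is a real vector type: one i1 element, 1 bit wide in total,
  // so prints "MVT::v1i1 = 1 x i1, 1 bit(s) total".
  outs() << printMaskVT(VT) << " = " << VT.getVectorNumElements() << " x i"
         << VT.getVectorElementType().getSizeInBits() << ", "
         << VT.getSizeInBits() << " bit(s) total\n";
  return 0;
}

Because the scalar mask is now a 1 x i1 vector, the scalar AVX-512 intrinsics exercised in the tests above can reuse the ordinary vector-mask lowering; that is why the checked sequences collapse from andl $1 followed by a kmovw/kmovd (or kxnorw plus kshiftrw $15 for an all-ones mask) to a single kmovb, or to no mask setup at all.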