diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 411a97129d4b7..a06ba959917d8 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -648,6 +648,7 @@ class CodeGen final : public CodeGenInterface #if defined(TARGET_AMD64) void genAmd64EmitterUnitTestsSse2(); + void genAmd64EmitterUnitTestsApx(); #endif #endif // defined(DEBUG) diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 1d6ac2301c45d..641267f686058 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -2698,6 +2698,10 @@ void CodeGen::genEmitterUnitTests() { genAmd64EmitterUnitTestsSse2(); } + if (unitTestSectionAll || (strstr(unitTestSection, "apx") != nullptr)) + { + genAmd64EmitterUnitTestsApx(); + } #elif defined(TARGET_ARM64) if (unitTestSectionAll || (strstr(unitTestSection, "general") != nullptr)) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 6c23977f83a09..bd80187eea6a4 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -9053,6 +9053,225 @@ void CodeGen::genAmd64EmitterUnitTestsSse2() GetEmitter()->emitIns_R_R_R(INS_cvtsd2ss, EA_8BYTE, REG_XMM0, REG_XMM1, REG_XMM2); } +/***************************************************************************** + * Unit tests for the APX instructions. + */ + +void CodeGen::genAmd64EmitterUnitTestsApx() +{ + emitter* theEmitter = GetEmitter(); + + genDefineTempLabel(genCreateTempLabel()); + + // This test suite needs REX2 enabled. 
+ if (!theEmitter->UseRex2Encoding() && !theEmitter->emitComp->DoJitStressRex2Encoding()) + { + return; + } + + theEmitter->emitIns_R_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_add, EA_8BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_sub, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX); + + theEmitter->emitIns_R_R(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX); + + theEmitter->emitIns_Mov(INS_mov, EA_4BYTE, REG_EAX, REG_ECX, false); + theEmitter->emitIns_Mov(INS_movsx, EA_2BYTE, REG_EAX, REG_ECX, false); + theEmitter->emitIns_Mov(INS_movzx, EA_2BYTE, REG_EAX, REG_ECX, false); + + theEmitter->emitIns_R_R(INS_popcnt, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_lzcnt, EA_4BYTE, REG_EAX, REG_ECX); + theEmitter->emitIns_R_R(INS_tzcnt, EA_4BYTE, REG_EAX, REG_ECX); + + theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_add, EA_2BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_adc, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_sbb, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_EAX, 0x05); + 
theEmitter->emitIns_R_I(INS_cmp, EA_4BYTE, REG_EAX, 0x05); + theEmitter->emitIns_R_I(INS_test, EA_4BYTE, REG_EAX, 0x05); + + theEmitter->emitIns_R_I(INS_mov, EA_4BYTE, REG_EAX, 0xE0); + + // JIT tend to compress imm64 to imm32 if higher half is all-zero, make sure this test checks the path for imm64. + theEmitter->emitIns_R_I(INS_mov, EA_8BYTE, REG_RAX, 0xFFFF000000000000); + + // shf reg, cl + theEmitter->emitIns_R(INS_rol, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_ror, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_rcl, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_rcr, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_shl, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_shr, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_sar, EA_4BYTE, REG_EAX); + + // shf reg, 1 + theEmitter->emitIns_R(INS_rol_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_ror_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_rcl_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_rcr_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_shl_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_shr_1, EA_4BYTE, REG_EAX); + theEmitter->emitIns_R(INS_sar_1, EA_4BYTE, REG_EAX); + + // shf reg, imm8 + theEmitter->emitIns_R_I(INS_shl_N, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_shr_N, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_sar_N, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_rol_N, EA_4BYTE, REG_ECX, 0x05); + theEmitter->emitIns_R_I(INS_ror_N, EA_4BYTE, REG_ECX, 0x05); + // TODO-xarch-apx: not enable these 2 for now. 
+ // theEmitter->emitIns_R_I(INS_rcl_N, EA_4BYTE, REG_ECX, 0x05); + // theEmitter->emitIns_R_I(INS_rcr_N, EA_4BYTE, REG_ECX, 0x05); + + theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_EAX); + theEmitter->emitIns_R(INS_not, EA_2BYTE, REG_EAX); + + theEmitter->emitIns_R_AR(INS_lea, EA_4BYTE, REG_ECX, REG_EAX, 4); + + theEmitter->emitIns_R_AR(INS_mov, EA_1BYTE, REG_ECX, REG_EAX, 4); + theEmitter->emitIns_R_AR(INS_mov, EA_2BYTE, REG_ECX, REG_EAX, 4); + theEmitter->emitIns_R_AR(INS_mov, EA_4BYTE, REG_ECX, REG_EAX, 4); + theEmitter->emitIns_R_AR(INS_mov, EA_8BYTE, REG_ECX, REG_EAX, 4); + + theEmitter->emitIns_R_AR(INS_add, EA_1BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_add, EA_2BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_add, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_add, EA_8BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_or, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_adc, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_and, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_sub, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_xor, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_test, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_bsf, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_bsr, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_popcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_lzcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_tzcnt, EA_4BYTE, REG_EAX, REG_ECX, 4); + + theEmitter->emitIns_AR_R(INS_add, EA_1BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_add, EA_2BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_add, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_add, 
EA_8BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_or, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_adc, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_sbb, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_and, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_sub, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_xor, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_cmp, EA_4BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_AR_R(INS_test, EA_4BYTE, REG_EAX, REG_ECX, 4); + + theEmitter->emitIns_R_AR(INS_movsx, EA_2BYTE, REG_ECX, REG_EAX, 4); + theEmitter->emitIns_R_AR(INS_movzx, EA_2BYTE, REG_EAX, REG_ECX, 4); + theEmitter->emitIns_R_AR(INS_cmovo, EA_4BYTE, REG_EAX, REG_ECX, 4); + + theEmitter->emitIns_AR_R(INS_xadd, EA_4BYTE, REG_EAX, REG_EDX, 2); + + theEmitter->emitIns_R_R_I(INS_shld, EA_4BYTE, REG_EAX, REG_ECX, 5); + theEmitter->emitIns_R_R_I(INS_shrd, EA_2BYTE, REG_EAX, REG_ECX, 5); + // TODO-XArch-apx: S_R_I path only accepts SSE or VEX instructions, + // so I assume shld/shrd will not be taking the first argument from stack. + // theEmitter->emitIns_S_R_I(INS_shld, EA_2BYTE, 1, 2, REG_EAX, 5); + // theEmitter->emitIns_S_R_I(INS_shrd, EA_2BYTE, 1, 2, REG_EAX, 5); + + theEmitter->emitIns_AR_R(INS_cmpxchg, EA_2BYTE, REG_EAX, REG_EDX, 2); + + theEmitter->emitIns_R(INS_seto, EA_1BYTE, REG_EDX); + + theEmitter->emitIns_R(INS_bswap, EA_8BYTE, REG_EDX); + + // INS_bt only has reg-to-reg form. 
+ theEmitter->emitIns_R_R(INS_bt, EA_2BYTE, REG_EAX, REG_EDX); + + theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_EDX); + + theEmitter->emitIns_R_R(INS_xchg, EA_8BYTE, REG_EAX, REG_EDX); + + theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_EDX); + theEmitter->emitIns_R(INS_mulEAX, EA_8BYTE, REG_EDX); + + GenTreePhysReg physReg(REG_EDX); + physReg.SetRegNum(REG_EDX); + GenTreeIndir load = indirForm(TYP_INT, &physReg); + + theEmitter->emitIns_R_A(INS_add, EA_1BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_add, EA_2BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_add, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_add, EA_8BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_or, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_adc, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_sbb, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_and, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_sub, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_xor, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_cmp, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_test, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_bsf, EA_4BYTE, REG_EAX, &load); + theEmitter->emitIns_R_A(INS_bsr, EA_4BYTE, REG_EAX, &load); + + // Note: + // All the tests below rely on the runtime status of the stack this unit tests attaching to, + // it might fail due to stack value unavailable/mismatch, since these tests are mainly for + // encoding correctness check, this kind of failures may be considered as not harmful. 
+ + theEmitter->emitIns_R_S(INS_add, EA_1BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_add, EA_2BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_add, EA_8BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_adc, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_sbb, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_cmp, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_test, EA_4BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_S_R(INS_xadd, EA_2BYTE, REG_EAX, 0, 0); + + theEmitter->emitIns_S_I(INS_shl_N, EA_4BYTE, 0, 0, 4); + theEmitter->emitIns_S(INS_shl_1, EA_4BYTE, 0, 4); + + theEmitter->emitIns_R_S(INS_movsx, EA_2BYTE, REG_ECX, 0, 0); + theEmitter->emitIns_R_S(INS_movzx, EA_2BYTE, REG_EAX, 0, 0); + theEmitter->emitIns_R_S(INS_cmovo, EA_4BYTE, REG_EAX, 0, 0); + + theEmitter->emitIns_R(INS_pop, EA_PTRSIZE, REG_EAX); + theEmitter->emitIns_R(INS_push, EA_PTRSIZE, REG_EAX); + theEmitter->emitIns_R(INS_pop_hide, EA_PTRSIZE, REG_EAX); + theEmitter->emitIns_R(INS_push_hide, EA_PTRSIZE, REG_EAX); + + theEmitter->emitIns_S(INS_pop, EA_PTRSIZE, 0, 0); + theEmitter->emitIns_I(INS_push, EA_PTRSIZE, 50); + + theEmitter->emitIns_R(INS_inc, EA_4BYTE, REG_EAX); + theEmitter->emitIns_AR(INS_inc, EA_2BYTE, REG_EAX, 2); + theEmitter->emitIns_S(INS_inc, EA_2BYTE, 0, 0); + theEmitter->emitIns_R(INS_dec, EA_4BYTE, REG_EAX); + theEmitter->emitIns_AR(INS_dec, EA_2BYTE, REG_EAX, 2); + theEmitter->emitIns_S(INS_dec, EA_2BYTE, 0, 0); + + theEmitter->emitIns_S(INS_neg, EA_2BYTE, 0, 0); + theEmitter->emitIns_S(INS_not, EA_2BYTE, 0, 0); +} + #endif // defined(DEBUG) && defined(TARGET_AMD64) #ifdef PROFILING_SUPPORTED diff --git a/src/coreclr/jit/compiler.cpp 
b/src/coreclr/jit/compiler.cpp index e2bae6d02fd67..a3d854e17eff0 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2295,7 +2295,10 @@ void Compiler::compSetProcessor() if (canUseEvexEncoding()) { codeGen->GetEmitter()->SetUseEvexEncoding(true); - // TODO-XArch-AVX512 : Revisit other flags to be set once avx512 instructions are added. + } + if (canUseApxEncoding()) + { + codeGen->GetEmitter()->SetUseRex2Encoding(true); } } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 5e05ff24cf1b2..5160d287cb311 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -9921,6 +9921,17 @@ class Compiler return (compOpportunisticallyDependsOn(InstructionSet_EVEX)); } + //------------------------------------------------------------------------ + // canUseApxEncoding - Answer the question: Is Rex2 encoding supported on this target. + // + // Returns: + // `true` if Rex2 encoding is supported, `false` if not. + // + bool canUseApxEncoding() const + { + return compOpportunisticallyDependsOn(InstructionSet_APX); + } + private: //------------------------------------------------------------------------ // DoJitStressEvexEncoding- Answer the question: Do we force EVEX encoding. 
@@ -9935,7 +9946,7 @@ class Compiler // otherwise use VEX encoding but can be EVEX encoded to use EVEX encoding // This requires AVX512F, AVX512BW, AVX512CD, AVX512DQ, and AVX512VL support - if (JitConfig.JitStressEvexEncoding() && IsBaselineVector512IsaSupportedOpportunistically()) + if (JitStressEvexEncoding() && IsBaselineVector512IsaSupportedOpportunistically()) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F)); assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F_VL)); @@ -9948,14 +9959,49 @@ class Compiler return true; } - else if (JitConfig.JitStressEvexEncoding() && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + else if (JitStressEvexEncoding() && compOpportunisticallyDependsOn(InstructionSet_AVX10v1)) + { + return true; + } +#endif // DEBUG + + return false; + } + + //------------------------------------------------------------------------ + // DoJitStressRex2Encoding- Answer the question: Do we force REX2 encoding. + // + // Returns: + // `true` if user requests REX2 encoding. + // + bool DoJitStressRex2Encoding() const + { +#ifdef DEBUG + if (JitConfig.JitStressRex2Encoding() && compOpportunisticallyDependsOn(InstructionSet_APX)) { + // we should make sure EVEX is also stressed when REX2 is stressed, as we will need to guarantee EGPR + // functionality is properly turned on for every instruction when REX2 is stressed. return true; } #endif // DEBUG return false; } + + //------------------------------------------------------------------------ + // JitStressEvexEncoding- Answer the question: Is Evex stress knob set + // + // Returns: + // `true` if user requests EVEX or REX2 stress encoding. 
+ // + bool JitStressEvexEncoding() const + { +#ifdef DEBUG + return JitConfig.JitStressEvexEncoding() || JitConfig.JitStressRex2Encoding(); +#endif // DEBUG + + return false; + } #endif // TARGET_XARCH /* diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 92d89a9e378f8..dc0f977b60862 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -470,6 +470,7 @@ class emitter #ifdef TARGET_XARCH SetUseVEXEncoding(false); SetUseEvexEncoding(false); + SetUseRex2Encoding(false); #endif // TARGET_XARCH emitDataSecCur = nullptr; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index e4f7b70182167..70f54f021c937 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -230,6 +230,12 @@ bool emitter::HasEvexEncoding(instruction ins) const return (flags & Encoding_EVEX) != 0; } +bool emitter::HasRex2Encoding(instruction ins) const +{ + insFlags flags = CodeGenInterface::instInfo[ins]; + return (flags & Encoding_REX2) != 0; +} + bool emitter::IsVexEncodableInstruction(instruction ins) const { if (!UseVEXEncoding()) @@ -269,6 +275,65 @@ bool emitter::IsEvexEncodableInstruction(instruction ins) const } } +//------------------------------------------------------------------------ +// IsRex2EncodableInstruction: Answer the question- Can this instruction be Rex2 encoded. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins can be Rex2 encoded. +// +bool emitter::IsRex2EncodableInstruction(instruction ins) const +{ + if (!UseRex2Encoding()) + { + return false; + } + return HasRex2Encoding(ins); +} + +//------------------------------------------------------------------------ +// IsLegacyMap1: Answer the question- Is this instruction on legacy-map-1 +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// `true` if ins is a legacy-map-1 instruction. 
+// +bool emitter::IsLegacyMap1(code_t code) const +{ +#ifdef TARGET_AMD64 + // Legacy-Map-1 opcode is defined as 2-byte opcode with a leading byte of 0x0F, + // In JIT, it could be in the following style: + // 2-byte: XX0F + // 3-byte: 0F00XX + // 4-byte: 0FPP00XX + + if ((code & 0xFFFF00FF) == 0x0000000F) + { + // 2-byte + return true; + } + if ((code & 0xFFFF0000) == 0x000F0000) + { + // 3-byte + return true; + } + + if ((code & 0xFF00FF00) == 0x0F000000) + { + // 4-byte, need to check if PP is a prefix. + BYTE prefix = (BYTE)((code & 0xFF0000) >> 16); + return ((prefix == 0xF2) || (prefix == 0xF3) || (prefix == 0x66)); + } + + return false; +#endif // TARGET_AMD64 + return false; +} + //------------------------------------------------------------------------ // Answer the question: Is this a SIMD instruction. // @@ -1300,6 +1365,49 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const return false; } +//------------------------------------------------------------------------ +// TakesRex2Prefix: Checks if the instruction should be Rex2 encoded. +// +// Arguments: +// id -- instruction descriptor to check +// +// Return Value: +// true if this instruction requires a Rex2 prefix. +// +bool emitter::TakesRex2Prefix(const instrDesc* id) const +{ + // Return true iff the instruction supports REX2 encoding, and it requires to access EGPRs. + + // TODO-xarch-apx: + // At this stage, we are only using REX2 in the case that non-simd integer instructions + // with EGPRs being used in its operands, it could be either direct register uses, or + // memory addressing operands, i.e. index and base. 
+ instruction ins = id->idIns(); + if (!IsRex2EncodableInstruction(ins)) + { + return false; + } + + if (TakesEvexPrefix(id)) + { + return false; + } + + if (HasExtendedGPReg(id)) + { + return true; + } + +#if defined(DEBUG) + if (emitComp->DoJitStressRex2Encoding()) + { + return true; + } +#endif // DEBUG + + return false; +} + // Intel AVX-512 encoding is defined in "Intel 64 and ia-32 architectures software developer's manual volume 2", Section // 2.6. // Add base EVEX prefix without setting W, R, X, or B bits @@ -1533,6 +1641,26 @@ emitter::code_t emitter::AddVexPrefix(instruction ins, code_t code, emitAttr att return code; } +// Add base REX2 prefix without setting higher register index bits +#define DEFAULT_2BYTE_REX2_PREFIX 0xD50000000000ULL +#define DEFAULT_2BYTE_REX2_PREFIX_MASK 0xFFFF00000000ULL +#define REX2_MAP1_PREFIX 0x008000000000ULL +emitter::code_t emitter::AddRex2Prefix(instruction ins, code_t code) +{ + assert(IsRex2EncodableInstruction(ins)); + + // Note that there are cases that some register field might be filled before adding prefix, + // So we don't check if the code has REX2 prefix already or not. + + code |= DEFAULT_2BYTE_REX2_PREFIX; + if (IsLegacyMap1(code)) // 2-byte opcode on Map-1 + { + code |= REX2_MAP1_PREFIX; + } + + return code; +} + // Returns true if this instruction, for the given EA_SIZE(attr), will require a REX.W prefix bool emitter::TakesRexWPrefix(const instrDesc* id) const { @@ -1675,6 +1803,36 @@ bool emitter::HasHighSIMDReg(const instrDesc* id) const return false; } +//------------------------------------------------------------------------ +// HasExtendedGPReg: Checks if an instruction uses an extended general-purpose register - EGPR (r16-r31) +// and will require one of the REX2 EGPR bits (REX2.R4/R3, REX2.B4/B3, REX2.X4/X3) +// +// Arguments: +// id -- instruction descriptor for encoding +// +// Return Value: +// true if instruction will require REX2 encoding for its register operands. 
+bool emitter::HasExtendedGPReg(const instrDesc* id) const +{ +#if defined(TARGET_AMD64) + // First check if addressing mode is used and if any of those uses eGPRs. + if (id->idHasMemAdr() && + (IsExtendedGPReg(id->idAddr()->iiaAddrMode.amBaseReg) || IsExtendedGPReg(id->idAddr()->iiaAddrMode.amIndxReg))) + { + return true; + } + + if ((id->idHasReg1() && IsExtendedGPReg(id->idReg1())) || (id->idHasReg2() && IsExtendedGPReg(id->idReg2())) || + (id->idHasReg3() && IsExtendedGPReg(id->idReg3())) || (id->idHasReg4() && IsExtendedGPReg(id->idReg4()))) + { + return true; + } + +#endif + // X86 JIT operates in 32-bit mode and hence extended reg are not available. + return false; +} + //------------------------------------------------------------------------ // HasMaskReg: Checks if an instruction uses a KMask registers (k0-k7) // @@ -1718,6 +1876,7 @@ bool emitter::HasMaskReg(const instrDesc* id) const bool IsExtendedReg(regNumber reg) { #ifdef TARGET_AMD64 + // TODO-XArch-apx: extend the gpr test, extended gprs should be from r8 to r31 after apx. return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM31)); #else // X86 JIT operates in 32-bit mode and hence extended reg are not available. @@ -1725,6 +1884,25 @@ bool IsExtendedReg(regNumber reg) #endif } +bool emitter::IsExtendedGPReg(regNumber reg) const +{ +// TODO-XArch-apx: +// Consider merge this method into IsExtendedReg(regNumber reg) +#ifdef TARGET_AMD64 + if (reg > REG_STK) + { + // not an actual reg + return false; + } + + // TODO-XArch-APX: + // we will eventually check EGPRs here: (reg >= REG_R16) && (reg <= REG_R31). + // revisit this part when LSRA is updated. 
+ return false; +#endif + return false; +} + // Returns true if using this register, for the given EA_SIZE(attr), will require a REX.* prefix bool IsExtendedReg(regNumber reg, emitAttr attr) { @@ -1830,6 +2008,10 @@ emitter::code_t emitter::AddRexWPrefix(const instrDesc* id, code_t code) } } #ifdef TARGET_AMD64 + else if (hasRex2Prefix(code)) + { + return emitter::code_t(code | 0x000800000000ULL); + } return emitter::code_t(code | 0x4800000000ULL); #else assert(!"UNREACHED"); @@ -1864,6 +2046,13 @@ emitter::code_t emitter::AddRexRPrefix(const instrDesc* id, code_t code) return code & 0xFF7FFFFFFFFFFFULL; } } +#ifdef TARGET_AMD64 + else if (TakesRex2Prefix(id)) + { + assert(IsRex2EncodableInstruction(ins)); + return code |= 0xD50400000000ULL; // REX2.R3 + } +#endif // TARGET_AMD64 return code | 0x4400000000ULL; } @@ -1893,6 +2082,13 @@ emitter::code_t emitter::AddRexXPrefix(const instrDesc* id, code_t code) return code & 0xFFBFFFFFFFFFFFULL; } } +#ifdef TARGET_AMD64 + else if (TakesRex2Prefix(id)) + { + assert(IsRex2EncodableInstruction(ins)); + return code |= 0xD50200000000ULL; // REX2.X3 + } +#endif // TARGET_AMD64 return code | 0x4200000000ULL; } @@ -1922,6 +2118,13 @@ emitter::code_t emitter::AddRexBPrefix(const instrDesc* id, code_t code) return code & 0xFFDFFFFFFFFFFFULL; } } +#ifdef TARGET_AMD64 + else if (TakesRex2Prefix(id)) + { + assert(IsRex2EncodableInstruction(ins)); + return code |= 0xD50100000000ULL; // REX2.B3 + } +#endif // TARGET_AMD64 return code | 0x4100000000ULL; } @@ -2376,6 +2579,73 @@ unsigned emitter::emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst, return 3; } + else if (hasRex2Prefix(code)) + { + WORD rex2Prefix = (code >> 32) & 0xFFFF; + noway_assert(rex2Prefix >= 0xD500 && rex2Prefix <= 0xD5FF); + code &= 0x00000000FFFFFFFFLL; + int emittedSize = 0; + + if ((code & 0xFF) == 0x0F) + { + // some map-1 instructions have opcode in forms like: + // XX0F, remove the leading 0x0F byte as it has been recorded in REX2. 
+ code = code >> 8; + } + + BYTE check = (code >> 24) & 0xFF; + if (check == 0) + { + // 3-byte opcode: with the bytes ordered as 0x00113322 + // check for a prefix in the 11 position + check = (code >> 16) & 0xFF; + if (check != 0 && isPrefix(check)) + { + code &= 0x00000000FF00FFFFLL; + emittedSize += emitOutputByte(dst, check); + dst += 1; + } + + if (check == 0x0F) + { + // REX2 does not need 0F in the opcode. + code &= 0x00000000FF00FFFFLL; + } + } + else + { + // 4-byte opcode with the bytes ordered as 0x22114433 + // first check for a prefix in the 11 position + BYTE check2 = (code >> 16) & 0xFF; + if (isPrefix(check2)) + { + assert(!isPrefix(check)); // We currently don't use this, so it is untested + if (isPrefix(check)) + { + // 3 prefixes were rex = rr, check = c1, check2 = c2 encoded as 0xrrc1c2XXXX + // Change to c2rrc1XXXX, and emit check2 now + // code = (((code_t)prefix << 24) | ((code_t)check << 16) | (code & 0x0000FFFFLL)); + code &= 0x000000000000FFFFLL; + emittedSize += emitOutputByte(dst, check2); + dst += 1; + emittedSize += emitOutputByte(dst, check); + dst += 1; + } + else + { + // REX2 do not need the 0F prefix. + code &= 0x000000000000FFFFLL; + emittedSize += emitOutputByte(dst, check2); + dst += 1; + } + } + } + + emittedSize += emitOutputByte(dst, ((rex2Prefix >> 8) & 0xFF)); + emittedSize += emitOutputByte(dst + 1, (rex2Prefix & 0xFF)); + + return emittedSize; + } #ifdef TARGET_AMD64 if (code > 0x00FFFFFFFFLL) @@ -2478,6 +2748,25 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins) return 1; } +// Size of rex prefix in bytes +unsigned emitter::emitGetRexPrefixSize(instrDesc* id, instruction ins) +{ + // In case of AVX instructions, REX prefixes are part of VEX prefix. + // And hence requires no additional byte to encode REX prefixes. + if (IsVexOrEvexEncodableInstruction(ins)) + { + return 0; + } + + if (TakesRex2Prefix(id)) + { + return 0; + } + + // If not AVX, then we would need 1-byte to encode REX prefix. 
+ return 1; +} + //------------------------------------------------------------------------ // emitGetEvexPrefixSize: Gets Size of EVEX prefix in bytes // @@ -2578,6 +2867,32 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const // The 4-Byte SSE instructions require one additional byte to hold the ModRM byte adjustedSize++; } +#ifdef TARGET_AMD64 + else if (IsRex2EncodableInstruction(ins)) + { + unsigned prefixAdjustedSize = 0; + if (TakesRex2Prefix(id)) + { + prefixAdjustedSize = 2; + // If the opcode will be prefixed by REX2, then all the map-1-legacy instructions can remove the escape + // prefix + if (IsLegacyMap1(code)) + { + prefixAdjustedSize -= 1; + } + } + + adjustedSize = prefixAdjustedSize; + + emitAttr attr = id->idOpSize(); + + if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx)) + { + // Most 16-bit operand instructions will need a 0x66 prefix. + adjustedSize++; + } + } +#endif // TARGET_AMD64 else { if (ins == INS_crc32) @@ -2619,6 +2934,12 @@ unsigned emitter::emitGetPrefixSize(instrDesc* id, code_t code, bool includeRexP return emitGetVexPrefixSize(id); } + if (hasRex2Prefix(code)) + { + assert(IsRex2EncodableInstruction(id->idIns())); + return 2; + } + if (includeRexPrefixSize && hasRexPrefix(code)) { return 1; @@ -3251,12 +3572,18 @@ inline unsigned emitter::insEncodeReg012(const instrDesc* id, regNumber reg, emi { *code = AddRexBPrefix(id, *code); // REX.B } + if (false /*reg >= REG_R16 && reg <= REG_R31*/) + { + // Seperate the encoding for REX2.B3/B4, REX2.B3 will be handled in `AddRexBPrefix`. + assert(TakesRex2Prefix(id)); + *code |= 0x001000000000ULL; // REX2.B4 + } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) { // We are assuming that we only use/encode SPL, BPL, SIL and DIL // not the corresponding AH, CH, DH, or BH - *code = AddRexPrefix(ins, *code); // REX + *code = hasRex2Prefix(*code) ? 
*code : AddRexPrefix(ins, *code); // REX } #endif // TARGET_AMD64 @@ -3294,12 +3621,18 @@ inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emi { *code = AddRexRPrefix(id, *code); // REX.R } + if (false /*reg >= REG_R16 && reg <= REG_R31*/) + { + // seperate the encoding for REX2.R3/R4, REX2.R3 will be handled in `AddRexRPrefix`. + assert(TakesRex2Prefix(id)); + *code |= 0x004000000000ULL; // REX2.R4 + } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) { // We are assuming that we only use/encode SPL, BPL, SIL and DIL // not the corresponding AH, CH, DH, or BH - *code = AddRexPrefix(ins, *code); // REX + *code = hasRex2Prefix(*code) ? *code : AddRexPrefix(ins, *code); // REX } #endif // TARGET_AMD64 @@ -3397,7 +3730,13 @@ inline unsigned emitter::insEncodeRegSIB(const instrDesc* id, regNumber reg, cod } if (reg & 0x8) { - *code = AddRexXPrefix(id, *code); // REX.B + *code = AddRexXPrefix(id, *code); // REX.X + } + if (false /*reg >= REG_R16 && reg <= REG_R31*/) + { + // seperate the encoding for REX2.X3/X4, REX2.X3 will be handled in `AddRexXPrefix`. 
+ assert(TakesRex2Prefix(id)); + *code |= 0x002000000000ULL; // REX2.X4 } } unsigned regBits = RegEncoding(reg); @@ -3696,7 +4035,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code) if (TakesRexWPrefix(id) || IsExtendedReg(id->idReg1(), attr) || IsExtendedReg(id->idReg2(), attr) || (!id->idIsSmallDsc() && (IsExtendedReg(id->idReg3(), attr) || IsExtendedReg(id->idReg4(), attr)))) { - sz += emitGetRexPrefixSize(ins); + sz += emitGetRexPrefixSize(id, ins); includeRexPrefixSize = !IsVexEncodableInstruction(ins); } @@ -3784,7 +4123,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) if ((TakesRexWPrefix(id) && ((ins != INS_xor) || (reg1 != reg2))) || IsExtendedReg(reg1, attr) || IsExtendedReg(reg2, attr)) { - sz += emitGetRexPrefixSize(ins); + sz += emitGetRexPrefixSize(id, ins); includeRexPrefixSize = false; } } @@ -3997,7 +4336,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var // REX prefix if (TakesRexWPrefix(id) || IsExtendedReg(id->idReg1(), attrSize) || IsExtendedReg(id->idReg2(), attrSize)) { - prefix += emitGetRexPrefixSize(ins); + prefix += emitGetRexPrefixSize(id, ins); } return prefix + emitInsSizeSVCalcDisp(id, code, var, dsp); @@ -4041,7 +4380,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSV(instrDesc* id, code_t code, int var // 64-bit operand instructions will need a REX.W prefix if (TakesRexWPrefix(id) || IsExtendedReg(id->idReg1(), attrSize) || IsExtendedReg(id->idReg2(), attrSize)) { - prefix += emitGetRexPrefixSize(ins); + prefix += emitGetRexPrefixSize(id, ins); } return prefix + valSize + emitInsSizeSVCalcDisp(id, code, var, dsp); @@ -4164,18 +4503,18 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) if (hasRexPrefix(code)) { // REX prefix - size += emitGetRexPrefixSize(ins); + size += emitGetRexPrefixSize(id, ins); } else if (TakesRexWPrefix(id)) { // REX.W prefix - size += emitGetRexPrefixSize(ins); + size += emitGetRexPrefixSize(id, ins); } else if 
(IsExtendedReg(reg, EA_PTRSIZE) || IsExtendedReg(rgx, EA_PTRSIZE) || ((ins != INS_call) && (IsExtendedReg(id->idReg1(), attrSize) || IsExtendedReg(id->idReg2(), attrSize)))) { // Should have a REX byte - size += emitGetRexPrefixSize(ins); + size += emitGetRexPrefixSize(id, ins); } if (rgx == REG_NA) @@ -4382,7 +4721,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeCV(instrDesc* id, code_t code) // 64-bit operand instructions will need a REX.W prefix if (TakesRexWPrefix(id) || IsExtendedReg(id->idReg1(), attrSize) || IsExtendedReg(id->idReg2(), attrSize)) { - size += emitGetRexPrefixSize(ins); + size += emitGetRexPrefixSize(id, ins); includeRexPrefixSize = false; } @@ -4639,7 +4978,7 @@ void emitter::emitIns(instruction ins, emitAttr attr) sz += emitGetAdjustedSize(id, code); if (TakesRexWPrefix(id)) { - sz += emitGetRexPrefixSize(ins); + sz += emitGetRexPrefixSize(id, ins); } id->idCodeSize(sz); @@ -5645,7 +5984,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) // REX byte if (IsExtendedReg(reg, attr) || TakesRexWPrefix(id)) { - sz += emitGetRexPrefixSize(ins); + sz += emitGetRexPrefixSize(id, ins); } id->idCodeSize(sz); @@ -5875,7 +6214,7 @@ void emitter::emitIns_R_I(instruction ins, // register. So we also need to check if that built-in register is an extended register. 
if (IsExtendedReg(reg, attr) || TakesRexWPrefix(id) || instrIsExtendedReg3opImul(ins)) { - sz += emitGetRexPrefixSize(ins); + sz += emitGetRexPrefixSize(id, ins); } id->idCodeSize(sz); @@ -5934,6 +6273,8 @@ void emitter::emitIns_I(instruction ins, emitAttr attr, cnsval_ssize_t val) id = emitNewInstrSC(attr, val); id->idIns(ins); id->idInsFmt(IF_CNS); + + sz += emitGetAdjustedSize(id, insCodeMI(ins)); id->idCodeSize(sz); dispIns(id); @@ -6020,7 +6361,7 @@ void emitter::emitIns_C(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE fld if (TakesRexWPrefix(id)) { // REX.W prefix - sz += emitGetRexPrefixSize(ins); + sz += emitGetRexPrefixSize(id, ins); } id->idAddr()->iiaFieldHnd = fldHnd; @@ -7582,7 +7923,7 @@ void emitter::emitIns_C_R(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE f // REX prefix if (TakesRexWPrefix(id) || IsExtendedReg(reg, attr)) { - sz += emitGetRexPrefixSize(ins); + sz += emitGetRexPrefixSize(id, ins); } } else @@ -9911,6 +10252,11 @@ void emitter::emitIns_Call(EmitCallType callType, id->idAddr()->iiaAddr = (BYTE*)addr; sz = 6; + if (TakesRex2Prefix(id)) + { + sz += 2; + } + // Since this is an indirect call through a pointer and we don't // currently pass in emitAttr into this function, we query codegen // whether addr needs a reloc. @@ -12869,6 +13215,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // For INS_call the instruction size is actually the return value size if ((ins == INS_call) || (ins == INS_tail_i_jmp)) { + code = AddX86PrefixIfNeeded(id, code, size); if (ins == INS_tail_i_jmp) { // tail call with addressing mode (or through register) needs rex.w @@ -12967,7 +13314,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Emit SIMD prefix if required // There are some callers who already add SIMD prefix and call this routine. // Therefore, add SIMD prefix is one is not already present. 
- code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); + code = AddX86PrefixIfNeededAndNotPresent(id, code, size); // For this format, moves do not support a third operand, so we only need to handle the binary ops. if (TakesSimdPrefix(id)) @@ -13884,7 +14231,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Add VEX or EVEX prefix if required. // There are some callers who already add prefix and call this routine. // Therefore, add VEX or EVEX prefix if one is not already present. - code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); + code = AddX86PrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix if (TakesRexWPrefix(id)) @@ -14353,7 +14700,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Compute VEX/EVEX prefix // Some of its callers already add EVEX/VEX prefix and then call this routine. // Therefore add EVEX/VEX prefix is not already present. - code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); + code = AddX86PrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix if (TakesRexWPrefix(id)) @@ -14853,6 +15200,11 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code |= 0x1; } + if (TakesRex2Prefix(id)) + { + code = AddRex2Prefix(ins, code); + } + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); @@ -14885,6 +15237,13 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(size == EA_PTRSIZE); code = insEncodeOpreg(id, reg, size); + // TODO-xarch-apx: it is TBD if we will use REX2 for PUSH/POP, + // the implementation here is optional. 
+ if (TakesRex2Prefix(id)) + { + code = AddRex2Prefix(ins, code); + } + assert(!TakesSimdPrefix(id)); assert(!TakesRexWPrefix(id)); @@ -14905,6 +15264,11 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code = insCodeRR(ins); + if (TakesRex2Prefix(id)) + { + code = AddRex2Prefix(ins, code); + } + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); @@ -14916,7 +15280,9 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); - dst += emitOutputWord(dst, code | (regcode << 8)); + // If the instruction is prefixed by REX2, then 0x0F has been removed, just need to emit the opcode byte. + dst += + TakesRex2Prefix(id) ? emitOutputByte(dst, code | regcode) : emitOutputWord(dst, code | (regcode << 8)); break; } @@ -14942,15 +15308,22 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code = insEncodeMRreg(id, reg, EA_1BYTE, insCodeMR(ins)); - // Output the REX prefix - dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); - - // We expect this to always be a 'big' opcode - assert(code & 0x00FF0000); - - dst += emitOutputByte(dst, code >> 16); - dst += emitOutputWord(dst, code & 0x0000FFFF); + if (TakesRex2Prefix(id)) + { + code = AddRex2Prefix(ins, code); + dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); + dst += emitOutputWord(dst, code & 0x0000FFFF); + } + else + { + // Output the REX prefix + dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); + // We expect this to always be a 'big' opcode + assert(code & 0x00FF0000); + dst += emitOutputByte(dst, code >> 16); + dst += emitOutputWord(dst, code & 0x0000FFFF); + } break; case INS_mulEAX: @@ -14980,7 +15353,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) } } - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); if (TakesRexWPrefix(id)) { @@ -15084,7 +15457,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { code = insCodeMR(ins); } - code = 
AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeRMreg(id, code); if (TakesRexWPrefix(id)) @@ -15096,7 +15469,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeRMreg(id, code) | (int)(size == EA_2BYTE); #ifdef TARGET_AMD64 @@ -15110,8 +15483,9 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeRMreg(id, code); + code = AddRexWPrefix(id, code); #endif // TARGET_AMD64 } @@ -15121,7 +15495,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeRMreg(id, code); if ((ins == INS_crc32) && (size > EA_1BYTE)) { @@ -15143,6 +15517,10 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert(!TakesSimdPrefix(id)); code = insCodeMR(ins); + if (TakesRex2Prefix(id)) + { + code = AddRex2Prefix(ins, code); + } code = insEncodeMRreg(id, code); if (ins != INS_test) @@ -15479,7 +15857,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) emitAttr size = id->idOpSize(); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeRMreg(id, code); @@ -15597,7 +15975,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // Get the 'base' opcode. 
code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeMIreg(id, reg, size, code); assert(code & 0x00FF0000); if (TakesSimdPrefix(id)) @@ -15641,6 +16019,8 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // This is INS_mov and will not take VEX prefix assert(!TakesVexPrefix(ins)); + code = AddX86PrefixIfNeededAndNotPresent(id, code, size); + if (TakesRexWPrefix(id)) { code = AddRexWPrefix(id, code); @@ -15735,6 +16115,9 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) { assert(!useSigned); code = insCodeACC(ins); + // TODO-xarch-apx: + // There is a case that instructions with 0xA* opcode cannot be prefixed by REX2. + code = (ins == INS_test) ? code : AddX86PrefixIfNeeded(id, code, size); } else { @@ -15749,7 +16132,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) else { code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeMIreg(id, reg, size, code); } } @@ -15976,15 +16359,27 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id) case INS_push: code = insCodeMI(ins); + // TODO-xarch-apx: it is TBD if we will use REX2 for PUSH/POP, + // the implementation here is optional. + if (TakesRex2Prefix(id)) + { + code = AddRex2Prefix(ins, code); + } + // Does the operand fit in a byte? if (valInByte) { + if (TakesRex2Prefix(id)) + { + dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); + } + dst += emitOutputByte(dst, code | 2); dst += emitOutputByte(dst, val); } else { - if (TakesRexWPrefix(id)) + if (TakesRexWPrefix(id) || TakesRex2Prefix(id)) { code = AddRexWPrefix(id, code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -16732,7 +17127,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) #ifdef TARGET_AMD64 // Support only scalar AVX instructions and hence size is hard coded to 4-byte. 
- code = AddSimdPrefixIfNeeded(id, code, EA_4BYTE); + code = AddX86PrefixIfNeeded(id, code, EA_4BYTE); if (((ins == INS_cdq) || (ins == INS_cwde)) && TakesRexWPrefix(id)) { @@ -16864,6 +17259,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert((ins == INS_call) || (ins == INS_tail_i_jmp)); code = insCodeMR(ins); + code = AddX86PrefixIfNeeded(id, code, size); if (id->idIsDspReloc()) { @@ -17065,7 +17461,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeMR(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeMRreg(id, id->idReg1(), size, code); // set the W bit @@ -17174,7 +17570,12 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeMR(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); + if ((size == EA_2BYTE) && TakesRex2Prefix(id)) + { + // REX2 needs explicit pp prefix. 
+ dst += emitOutputByte(dst, 0x66); + } code = insEncodeMRreg(id, code); mReg = id->idReg1(); rReg = id->idReg2(); @@ -17184,7 +17585,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeMI(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); assert((code & 0xC000) == 0); code |= 0xC000; @@ -17198,7 +17599,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeRM(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeRMreg(id, code); mReg = id->idReg2(); rReg = id->idReg1(); @@ -17354,13 +17755,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(!EncodedBySSE38orSSE3A(ins)); code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); regcode = insEncodeReg345(id, getSseShiftRegNumber(ins), size, &code); } else { code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); regcode = insEncodeReg345(id, id->idReg1(), size, &code); } @@ -17408,7 +17809,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); } @@ -17461,7 +17862,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } @@ -17485,7 +17886,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = 
AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); } @@ -17497,7 +17898,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD_RRD: { code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); dst = emitOutputAM(dst, id, code); sz = emitSizeOfInsDsc_AMD(id); break; @@ -17576,7 +17977,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SWR_RRD_CNS: case IF_SRW_RRD_CNS: { - assert(IsAvx512OrPriorInstruction(ins)); + assert(IsAvx512OrPriorInstruction(ins) || (ins == INS_shld) || (ins == INS_shrd)); emitGetInsAmdCns(id, &cnsVal); dst = emitOutputSV(dst, id, insCodeMR(ins), &cnsVal); sz = emitSizeOfInsDsc_CNS(id); @@ -17596,13 +17997,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(!EncodedBySSE38orSSE3A(ins)); code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); regcode = insEncodeReg345(id, getSseShiftRegNumber(ins), size, &code); } else { code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); regcode = insEncodeReg345(id, id->idReg1(), size, &code); } @@ -17647,7 +18048,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); if (IsDstDstSrcAVXInstruction(ins)) { @@ -17670,7 +18071,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodableInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits 
in 1's complement form @@ -17696,7 +18097,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodableInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeReg3456(id, id->idReg2(), size, code); emitGetInsCns(id, &cnsVal); @@ -17749,7 +18150,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -17825,13 +18226,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(!EncodedBySSE38orSSE3A(ins)); code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); regcode = insEncodeReg345(id, getSseShiftRegNumber(ins), size, &code); } else { code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); regcode = insEncodeReg345(id, id->idReg1(), size, &code); } @@ -17891,7 +18292,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. 
@@ -17931,7 +18332,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodableInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form @@ -17957,7 +18358,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodableInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); code = insEncodeReg3456(id, id->idReg2(), size, code); emitGetInsCns(id, &cnsVal); @@ -17997,7 +18398,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_MRD_OFF: { code = insCode(ins); - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -18033,7 +18434,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(id, code, size); + code = AddX86PrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index cc70f02ed3606..5f820c7c022c2 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -38,6 +38,18 @@ inline static bool isHighSimdReg(regNumber reg) #endif } +inline static bool isHighGPReg(regNumber reg) +{ +#ifdef TARGET_AMD64 + // TODO-apx: the definition here is incorrect, we will need to revisit this after we extend the register definition. + // for now, we can simply use REX2 as REX. 
+ return ((reg >= REG_R8) && (reg <= REG_R15)); +#else + // X86 JIT operates in 32-bit mode and hence extended regs are not available. + return false; +#endif +} + /************************************************************************/ /* Routines that compute the size of / encode instructions */ /************************************************************************/ @@ -83,11 +95,13 @@ BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id); unsigned emitOutputRexOrSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code); unsigned emitGetRexPrefixSize(instruction ins); +unsigned emitGetRexPrefixSize(instrDesc* id, instruction ins); unsigned emitGetVexPrefixSize(instrDesc* id) const; unsigned emitGetEvexPrefixSize(instrDesc* id) const; unsigned emitGetPrefixSize(instrDesc* id, code_t code, bool includeRexPrefixSize); unsigned emitGetAdjustedSize(instrDesc* id, code_t code) const; +code_t emitExtractRex2Prefix(instruction ins, code_t& code) const; code_t emitExtractVexPrefix(instruction ins, code_t& code) const; code_t emitExtractEvexPrefix(instruction ins, code_t& code) const; @@ -119,17 +133,22 @@ static regNumber getBmiRegNumber(instruction ins); static regNumber getSseShiftRegNumber(instruction ins); bool HasVexEncoding(instruction ins) const; bool HasEvexEncoding(instruction ins) const; +bool HasRex2Encoding(instruction ins) const; bool IsVexEncodableInstruction(instruction ins) const; bool IsEvexEncodableInstruction(instruction ins) const; +bool IsRex2EncodableInstruction(instruction ins) const; +bool IsLegacyMap1(code_t code) const; bool IsVexOrEvexEncodableInstruction(instruction ins) const; code_t insEncodeMIreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code); code_t AddRexWPrefix(const instrDesc* id, code_t code); +code_t AddRex2WPrefix(const instrDesc* id, code_t code); code_t AddRexRPrefix(const instrDesc* id, code_t code); code_t AddRexXPrefix(const instrDesc* id, code_t code); code_t AddRexBPrefix(const instrDesc* id, 
code_t code); code_t AddRexPrefix(instruction ins, code_t code); +code_t AddRex2Prefix(instruction ins, code_t code); bool EncodedBySSE38orSSE3A(instruction ins) const; bool Is4ByteSSEInstruction(instruction ins) const; @@ -202,6 +221,32 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr insTupleType insTupleTypeInfo(instruction ins) const; +// 2-byte REX2 prefix starts with byte 0xD5 +#define REX2_PREFIX_MASK_2BYTE 0xFF0000000000ULL +#define REX2_PREFIX_CODE_2BYTE 0xD50000000000ULL + +bool TakesRex2Prefix(const instrDesc* id) const; +//------------------------------------------------------------------------ +// hasRex2Prefix: Returns true if the instruction encoding already +// contains a REX2 prefix. +// +// Arguments: +// code - opcode + prefixes bits at some stage of encoding. +// +// Returns: +// `true` if code has a REX2 prefix. +// +bool hasRex2Prefix(code_t code) +{ +#ifdef TARGET_AMD64 + return (code & REX2_PREFIX_MASK_2BYTE) == REX2_PREFIX_CODE_2BYTE; +#else + return false; +#endif +} + +bool IsExtendedGPReg(regNumber reg) const; + //------------------------------------------------------------------------ // HasKMaskRegisterDest: Temporary check to identify instructions that can // be Evex encoded but require Opmask(KMask) register support. @@ -275,6 +320,18 @@ void SetUseEvexEncoding(bool value) useEvexEncodings = value; } +// Is Rex2 encoding supported. +bool useRex2Encodings; +bool UseRex2Encoding() const +{ + return useRex2Encodings; +} + +void SetUseRex2Encoding(bool value) +{ + useRex2Encodings = value; +} + //------------------------------------------------------------------------ // UseSimdEncoding: Returns true if either VEX or EVEX encoding is supported // contains Evex prefix.
@@ -338,6 +395,78 @@ code_t AddSimdPrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) return code; } +//------------------------------------------------------------------------ +// AddX86PrefixIfNeeded: Add the correct instruction prefix if required. +// +// Arguments: +// id - instruction descriptor of the instruction being encoded. +// code - opcode + prefixes bits at some stage of encoding. +// size - operand size +// +code_t AddX86PrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) +{ + // TODO-xarch-apx: + // consider refactoring this together with AddSimdPrefixIfNeeded, as much of the functionality + // of these functions overlaps. + + if (TakesEvexPrefix(id)) + { + return AddEvexPrefix(id, code, size); + } + + instruction ins = id->idIns(); + + if (TakesVexPrefix(ins)) + { + return AddVexPrefix(ins, code, size); + } + + // Based on how we labeled REX2 enabled instructions, we can confirm there is no + // overlap between REX2 and VEX/EVEX, so the order of the checks does not matter. + if (TakesRex2Prefix(id)) + { + return AddRex2Prefix(ins, code); + } + + return code; +} + +//------------------------------------------------------------------------ +// AddX86PrefixIfNeededAndNotPresent: Add the correct instruction prefix if required and not already present. +// +// Arguments: +// id - instruction descriptor of the instruction being encoded. +// code - opcode + prefixes bits at some stage of encoding. +// size - operand size +// +code_t AddX86PrefixIfNeededAndNotPresent(const instrDesc* id, code_t code, emitAttr size) +{ + // TODO-xarch-apx: + // consider refactoring this together with AddSimdPrefixIfNeeded, as much of the functionality + // of these functions overlaps. + + if (TakesEvexPrefix(id)) + { + return !hasEvexPrefix(code) ? AddEvexPrefix(id, code, size) : code; + } + + instruction ins = id->idIns(); + + if (TakesVexPrefix(ins)) + { + return !hasVexPrefix(code) ?
AddVexPrefix(ins, code, size) : code; + } + + // Based on how we labeled REX2 enabled instructions, we can confirm there will not be + // overlapping part between REX2 and VEX/EVEX, so order of the checks does not matter. + if (TakesRex2Prefix(id)) + { + return !hasRex2Prefix(code) ? AddRex2Prefix(ins, code) : code; + } + + return code; +} + //------------------------------------------------------------------------ // SetEvexBroadcastIfNeeded: set embedded broadcast if needed. // @@ -1089,6 +1218,7 @@ inline bool HasEmbeddedMask(const instrDesc* id) const } inline bool HasHighSIMDReg(const instrDesc* id) const; +inline bool HasExtendedGPReg(const instrDesc* id) const; inline bool HasMaskReg(const instrDesc* id) const; diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 5e27dcb9a2d86..5ec40ea333973 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -218,6 +218,9 @@ enum insFlags : uint64_t // EVEX feature: embedded broadcast INS_Flags_EmbeddedBroadcastSupported = 1ULL << 43, + // APX: REX2 prefix: + Encoding_REX2 = 1ULL << 44, + // TODO-Cleanup: Remove this flag and its usage from TARGET_XARCH INS_FLAGS_DONT_CARE = 0x00ULL, }; diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index bfb15df2cf397..fbc635ab5553b 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -52,71 +52,71 @@ // id nm um mr mi rm a4 rr tt flags INST5(invalid, "INVALID", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) -INST5(push, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, INS_FLAGS_None) -INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, INS_FLAGS_None) +INST5(push, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, Encoding_REX2) +INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, Encoding_REX2) // Does not affect the stack 
tracking in the emitter -INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, INS_FLAGS_None) -INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, INS_FLAGS_None) +INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, INS_TT_NONE, Encoding_REX2) +INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, INS_TT_NONE, Encoding_REX2) -INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit) -INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF) -INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit) -INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF) +INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000040, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit |Encoding_REX2) +INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2) +INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x000048, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2) // Multi-byte opcodes without modrm are represented in mixed endian fashion. // See comment around quarter way through this file for more information. 
-INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, INS_FLAGS_None) +INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, Encoding_REX2) // id nm um mr mi rm a4 tt flags -INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit) -INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit) -INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit) -INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit) -INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit) -INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit) +INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | 
Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) // Does not affect the stack tracking in the emitter -INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit) +INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit) -INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit) -INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit) -INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_TT_NONE, INS_FLAGS_Has_Wbit) +INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | 
Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(lea, "lea", IUM_WR, BAD_CODE, BAD_CODE, 0x00008D, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST4(lea, "lea", IUM_WR, BAD_CODE, BAD_CODE, 0x00008D, BAD_CODE, INS_TT_NONE, Encoding_REX2) // id nm um mr mi rm tt flags // Note that emitter has only partial support for BT. It can only emit the reg,reg form // and the registers need to be reversed to get the correct encoding. -INST3(bt, "bt", IUM_RD, 0x0F00A3, BAD_CODE, 0x0F00A3, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF) +INST3(bt, "bt", IUM_RD, 0x0F00A3, BAD_CODE, 0x0F00A3, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | Encoding_REX2) -INST3(bsf, "bsf", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BC, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF) -INST3(bsr, "bsr", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BD, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF) +INST3(bsf, "bsf", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BC, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF | Encoding_REX2) +INST3(bsr, "bsr", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BD, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF | Encoding_REX2) -INST3(movsx, "movsx", IUM_WR, BAD_CODE, BAD_CODE, 
0x0F00BE, INS_TT_NONE, INS_FLAGS_Has_Wbit) +INST3(movsx, "movsx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BE, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) #ifdef TARGET_AMD64 -INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, 0x4800000063, INS_TT_NONE, INS_FLAGS_Has_Wbit) +INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, 0x000063, INS_TT_NONE, REX_W1 | Encoding_REX2) #endif -INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_TT_NONE, INS_FLAGS_Has_Wbit) - -INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF) -INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF) -INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF) -INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF) -INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF) -INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF) -INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF) -INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF) -INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF) -INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF) -INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF) -INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF) -INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF) -INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF) -INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) -INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) - -INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_TT_NONE, 
INS_FLAGS_Has_Wbit) +INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) + +INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, INS_TT_NONE, Reads_OF | Encoding_REX2) +INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, INS_TT_NONE, Reads_OF | Encoding_REX2) +INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, INS_TT_NONE, Reads_CF | Encoding_REX2) +INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, INS_TT_NONE, Reads_CF | Encoding_REX2) +INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, INS_TT_NONE, Reads_ZF | Encoding_REX2) +INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, INS_TT_NONE, Reads_ZF | Encoding_REX2) +INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) +INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) +INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, INS_TT_NONE, Reads_SF | Encoding_REX2) +INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, INS_TT_NONE, Reads_SF | Encoding_REX2) +INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, INS_TT_NONE, Reads_PF | Encoding_REX2) +INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, INS_TT_NONE, Reads_PF | Encoding_REX2) +INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) +INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) +INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) +INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) + +INST3(xchg, "xchg", IUM_RW, 0x000086, BAD_CODE, 0x000086, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 
0x0F00AF, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit) // id nm um mr mi rm tt flags @@ -888,44 +888,44 @@ INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_ INST3(crc32, "crc32", IUM_RW, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF0), INS_TT_NONE, INS_FLAGS_None) // BMI1 -INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Count the Number of Trailing Zero Bits +INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | Encoding_REX2) // Count the Number of Trailing Zero Bits // LZCNT -INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) +INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | Encoding_REX2) // MOVBE INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), INS_TT_NONE, INS_FLAGS_None) // POPCNT -INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF) +INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | Encoding_REX2) // id nm um mr mi flags INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, INS_TT_NONE, INS_FLAGS_None) INST2(loop, "loop", IUM_RD, BAD_CODE, 0x0000E2, INS_TT_NONE, INS_FLAGS_None) -INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_TT_NONE, INS_FLAGS_None) +INST2(call, "call", IUM_RD, 0x0010FF, 0x0000E8, INS_TT_NONE, Encoding_REX2) -INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | 
INS_FLAGS_Has_Wbit) -INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit) +INST2(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) -INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) +INST2(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) INST2(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) -INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_TT_NONE, Undefined_OF | 
Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) -INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) +INST2(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) INST2(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit) -INST2(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) +INST2(shl, "shl", IUM_RW, 
0x0020D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST2(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) // id nm um mr flags @@ -959,23 +959,23 @@ INST1(leave, "leave", IUM_RD, 0x0000C9, INST1(serialize, "serialize", IUM_RD, 0x0fe801, INS_TT_NONE, INS_FLAGS_None) -INST1(neg, "neg", IUM_RW, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST1(not, "not", IUM_RW, 0x0010F6, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit) +INST1(neg, "neg", IUM_RW, 0x0018F6, INS_TT_NONE, Writes_OF | 
Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST1(not, "not", IUM_RW, 0x0010F6, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2) INST1(cwde, "cwde", IUM_RD, 0x000098, INS_TT_NONE, INS_FLAGS_None) INST1(cdq, "cdq", IUM_RD, 0x000099, INS_TT_NONE, INS_FLAGS_None) -INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit) +INST1(idiv, "idiv", IUM_RD, 0x0038F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) INST1(imulEAX, "imul", IUM_RD, 0x0028F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit) -INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit) +INST1(div, "div", IUM_RD, 0x0030F6, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST1(mulEAX, "mul", IUM_RD, 0x0020F6, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) INST1(sahf, "sahf", IUM_RD, 0x00009E, INS_TT_NONE, Restore_SF_ZF_AF_PF_CF) -INST1(xadd, "xadd", IUM_RW, 0x0F00C0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) -INST1(cmpxchg, "cmpxchg", IUM_RW, 0x0F00B0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit) +INST1(xadd, "xadd", IUM_RW, 0x0F00C0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | 
INS_FLAGS_Has_Wbit | Encoding_REX2) +INST1(cmpxchg, "cmpxchg", IUM_RW, 0x0F00B0, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST1(shld, "shld", IUM_RW, 0x0F00A4, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF) -INST1(shrd, "shrd", IUM_RW, 0x0F00AC, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF) +INST1(shld, "shld", IUM_RW, 0x0F00A4, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | Encoding_REX2) +INST1(shrd, "shrd", IUM_RW, 0x0F00AC, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | Encoding_REX2) // For RyuJIT/x86, we follow the x86 calling convention that requires // us to return floating point value on the x87 FP stack, so we need @@ -985,28 +985,28 @@ INST1(fld, "fld", IUM_WR, 0x0000D9, INST1(fstp, "fstp", IUM_WR, 0x0018D9, INS_TT_NONE, INS_FLAGS_x87Instr) #endif // TARGET_X86 -INST1(seto, "seto", IUM_WR, 0x0F0090, INS_TT_NONE, Reads_OF) -INST1(setno, "setno", IUM_WR, 0x0F0091, INS_TT_NONE, Reads_OF) -INST1(setb, "setb", IUM_WR, 0x0F0092, INS_TT_NONE, Reads_CF) -INST1(setae, "setae", IUM_WR, 0x0F0093, INS_TT_NONE, Reads_CF) -INST1(sete, "sete", IUM_WR, 0x0F0094, INS_TT_NONE, Reads_ZF) -INST1(setne, "setne", IUM_WR, 0x0F0095, INS_TT_NONE, Reads_ZF) -INST1(setbe, "setbe", IUM_WR, 0x0F0096, INS_TT_NONE, Reads_ZF | Reads_CF) -INST1(seta, "seta", IUM_WR, 0x0F0097, INS_TT_NONE, Reads_ZF | Reads_CF) -INST1(sets, "sets", IUM_WR, 0x0F0098, INS_TT_NONE, Reads_SF) -INST1(setns, "setns", IUM_WR, 0x0F0099, INS_TT_NONE, Reads_SF) -INST1(setp, "setp", IUM_WR, 0x0F009A, INS_TT_NONE, Reads_PF) -INST1(setnp, "setnp", IUM_WR, 0x0F009B, INS_TT_NONE, Reads_PF) -INST1(setl, "setl", IUM_WR, 0x0F009C, INS_TT_NONE, Reads_OF | Reads_SF) -INST1(setge, "setge", IUM_WR, 0x0F009D, INS_TT_NONE, Reads_OF | Reads_SF) -INST1(setle, "setle", IUM_WR, 
0x0F009E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) -INST1(setg, "setg", IUM_WR, 0x0F009F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +INST1(seto, "seto", IUM_WR, 0x0F0090, INS_TT_NONE, Reads_OF | Encoding_REX2) +INST1(setno, "setno", IUM_WR, 0x0F0091, INS_TT_NONE, Reads_OF | Encoding_REX2) +INST1(setb, "setb", IUM_WR, 0x0F0092, INS_TT_NONE, Reads_CF | Encoding_REX2) +INST1(setae, "setae", IUM_WR, 0x0F0093, INS_TT_NONE, Reads_CF | Encoding_REX2) +INST1(sete, "sete", IUM_WR, 0x0F0094, INS_TT_NONE, Reads_ZF | Encoding_REX2) +INST1(setne, "setne", IUM_WR, 0x0F0095, INS_TT_NONE, Reads_ZF | Encoding_REX2) +INST1(setbe, "setbe", IUM_WR, 0x0F0096, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) +INST1(seta, "seta", IUM_WR, 0x0F0097, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) +INST1(sets, "sets", IUM_WR, 0x0F0098, INS_TT_NONE, Reads_SF | Encoding_REX2) +INST1(setns, "setns", IUM_WR, 0x0F0099, INS_TT_NONE, Reads_SF | Encoding_REX2) +INST1(setp, "setp", IUM_WR, 0x0F009A, INS_TT_NONE, Reads_PF | Encoding_REX2) +INST1(setnp, "setnp", IUM_WR, 0x0F009B, INS_TT_NONE, Reads_PF | Encoding_REX2) +INST1(setl, "setl", IUM_WR, 0x0F009C, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) +INST1(setge, "setge", IUM_WR, 0x0F009D, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) +INST1(setle, "setle", IUM_WR, 0x0F009E, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) +INST1(setg, "setg", IUM_WR, 0x0F009F, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) // Indirect jump used for tailcalls. We differentiate between func-internal // indirect jump (e.g. used for switch) and tailcall indirect jumps because the // x64 unwinder might require the latter to be rex.w prefixed. 
-INST1(tail_i_jmp, "tail.jmp", IUM_RD, 0x0020FF, INS_TT_NONE, INS_FLAGS_None) -INST1(i_jmp, "jmp", IUM_RD, 0x0020FF, INS_TT_NONE, INS_FLAGS_None) +INST1(tail_i_jmp, "tail.jmp", IUM_RD, 0x0020FF, INS_TT_NONE, Encoding_REX2) +INST1(i_jmp, "jmp", IUM_RD, 0x0020FF, INS_TT_NONE, Encoding_REX2) INST0(jmp, "jmp", IUM_RD, 0x0000EB, INS_TT_NONE, INS_FLAGS_None) INST0(jo, "jo", IUM_RD, 0x000070, INS_TT_NONE, Reads_OF) INST0(jno, "jno", IUM_RD, 0x000071, INS_TT_NONE, Reads_OF) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index bd0698d18b81d..4c5fc2e8d5328 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -368,6 +368,11 @@ RELEASE_CONFIG_INTEGER(EnableMultiRegLocals, "EnableMultiRegLocals", 1) // Disables inlining of all methods RELEASE_CONFIG_INTEGER(JitNoInline, "JitNoInline", 0) +#if defined(DEBUG) +CONFIG_INTEGER(JitStressRex2Encoding, "JitStressRex2Encoding", 0) // Enable rex2 encoding for legacy instructions. +CONFIG_INTEGER(JitBypassAPXCheck, "JitBypassAPXCheck", 0) // Bypass APX CPUID check. +#endif + // clang-format off #if defined(TARGET_AMD64) || defined(TARGET_X86)