Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the CPUID and XSAVE logics for APX #104637

Merged
merged 18 commits into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/design/features/xarch-apx.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# APX Integration in .NET
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doc can be added only once there is something worth talking about.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I can remove this for now and may add some specification to the design when the REX2 or APX-EVEX changes come in.


Let's keep documentation on APX integration and notes on things here. I will evolve this as necessary.
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ class AsmOffsets
// Debug build offsets
#if TARGET_AMD64
#if TARGET_UNIX
public const int SIZEOF__REGDISPLAY = 0x1a90;
public const int OFFSETOF__REGDISPLAY__SP = 0x1a78;
public const int OFFSETOF__REGDISPLAY__ControlPC = 0x1a80;
public const int SIZEOF__REGDISPLAY = 0x1b90;
tannergooding marked this conversation as resolved.
Show resolved Hide resolved
public const int OFFSETOF__REGDISPLAY__SP = 0x1b78;
public const int OFFSETOF__REGDISPLAY__ControlPC = 0x1b80;
#else // TARGET_UNIX
public const int SIZEOF__REGDISPLAY = 0xbf0;
public const int OFFSETOF__REGDISPLAY__SP = 0xbd8;
Expand Down Expand Up @@ -68,9 +68,9 @@ class AsmOffsets
// Release build offsets
#if TARGET_AMD64
#if TARGET_UNIX
public const int SIZEOF__REGDISPLAY = 0x1a80;
public const int OFFSETOF__REGDISPLAY__SP = 0x1a70;
public const int OFFSETOF__REGDISPLAY__ControlPC = 0x1a78;
public const int SIZEOF__REGDISPLAY = 0x1b80;
public const int OFFSETOF__REGDISPLAY__SP = 0x1b70;
public const int OFFSETOF__REGDISPLAY__ControlPC = 0x1b78;
#else // TARGET_UNIX
public const int SIZEOF__REGDISPLAY = 0xbf0;
public const int OFFSETOF__REGDISPLAY__SP = 0xbd0;
Expand Down Expand Up @@ -120,7 +120,7 @@ class AsmOffsets

#if TARGET_AMD64
#if TARGET_UNIX
public const int SIZEOF__PAL_LIMITED_CONTEXT = 0xc20;
public const int SIZEOF__PAL_LIMITED_CONTEXT = 0xca0;
#else // TARGET_UNIX
public const int SIZEOF__PAL_LIMITED_CONTEXT = 0x4d0;
#endif // TARGET_UNIX
Expand Down
126 changes: 72 additions & 54 deletions src/coreclr/inc/corinfoinstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,33 +81,35 @@ enum CORINFO_InstructionSet
InstructionSet_VectorT128=36,
InstructionSet_VectorT256=37,
InstructionSet_VectorT512=38,
InstructionSet_X86Base_X64=39,
InstructionSet_SSE_X64=40,
InstructionSet_SSE2_X64=41,
InstructionSet_SSE3_X64=42,
InstructionSet_SSSE3_X64=43,
InstructionSet_SSE41_X64=44,
InstructionSet_SSE42_X64=45,
InstructionSet_AVX_X64=46,
InstructionSet_AVX2_X64=47,
InstructionSet_AES_X64=48,
InstructionSet_BMI1_X64=49,
InstructionSet_BMI2_X64=50,
InstructionSet_FMA_X64=51,
InstructionSet_LZCNT_X64=52,
InstructionSet_PCLMULQDQ_X64=53,
InstructionSet_POPCNT_X64=54,
InstructionSet_AVXVNNI_X64=55,
InstructionSet_MOVBE_X64=56,
InstructionSet_X86Serialize_X64=57,
InstructionSet_EVEX_X64=58,
InstructionSet_AVX512F_X64=59,
InstructionSet_AVX512BW_X64=60,
InstructionSet_AVX512CD_X64=61,
InstructionSet_AVX512DQ_X64=62,
InstructionSet_AVX512VBMI_X64=63,
InstructionSet_AVX10v1_X64=64,
InstructionSet_AVX10v1_V512_X64=65,
InstructionSet_APX=39,
InstructionSet_X86Base_X64=40,
InstructionSet_SSE_X64=41,
InstructionSet_SSE2_X64=42,
InstructionSet_SSE3_X64=43,
InstructionSet_SSSE3_X64=44,
InstructionSet_SSE41_X64=45,
InstructionSet_SSE42_X64=46,
InstructionSet_AVX_X64=47,
InstructionSet_AVX2_X64=48,
InstructionSet_AES_X64=49,
InstructionSet_BMI1_X64=50,
InstructionSet_BMI2_X64=51,
InstructionSet_FMA_X64=52,
InstructionSet_LZCNT_X64=53,
InstructionSet_PCLMULQDQ_X64=54,
InstructionSet_POPCNT_X64=55,
InstructionSet_AVXVNNI_X64=56,
InstructionSet_MOVBE_X64=57,
InstructionSet_X86Serialize_X64=58,
InstructionSet_EVEX_X64=59,
InstructionSet_AVX512F_X64=60,
InstructionSet_AVX512BW_X64=61,
InstructionSet_AVX512CD_X64=62,
InstructionSet_AVX512DQ_X64=63,
InstructionSet_AVX512VBMI_X64=64,
InstructionSet_AVX10v1_X64=65,
InstructionSet_AVX10v1_V512_X64=66,
InstructionSet_APX_X64=67,
#endif // TARGET_AMD64
#ifdef TARGET_X86
InstructionSet_X86Base=1,
Expand Down Expand Up @@ -148,33 +150,35 @@ enum CORINFO_InstructionSet
InstructionSet_VectorT128=36,
InstructionSet_VectorT256=37,
InstructionSet_VectorT512=38,
InstructionSet_X86Base_X64=39,
InstructionSet_SSE_X64=40,
InstructionSet_SSE2_X64=41,
InstructionSet_SSE3_X64=42,
InstructionSet_SSSE3_X64=43,
InstructionSet_SSE41_X64=44,
InstructionSet_SSE42_X64=45,
InstructionSet_AVX_X64=46,
InstructionSet_AVX2_X64=47,
InstructionSet_AES_X64=48,
InstructionSet_BMI1_X64=49,
InstructionSet_BMI2_X64=50,
InstructionSet_FMA_X64=51,
InstructionSet_LZCNT_X64=52,
InstructionSet_PCLMULQDQ_X64=53,
InstructionSet_POPCNT_X64=54,
InstructionSet_AVXVNNI_X64=55,
InstructionSet_MOVBE_X64=56,
InstructionSet_X86Serialize_X64=57,
InstructionSet_EVEX_X64=58,
InstructionSet_AVX512F_X64=59,
InstructionSet_AVX512BW_X64=60,
InstructionSet_AVX512CD_X64=61,
InstructionSet_AVX512DQ_X64=62,
InstructionSet_AVX512VBMI_X64=63,
InstructionSet_AVX10v1_X64=64,
InstructionSet_AVX10v1_V512_X64=65,
InstructionSet_APX=39,
InstructionSet_X86Base_X64=40,
InstructionSet_SSE_X64=41,
InstructionSet_SSE2_X64=42,
InstructionSet_SSE3_X64=43,
InstructionSet_SSSE3_X64=44,
InstructionSet_SSE41_X64=45,
InstructionSet_SSE42_X64=46,
InstructionSet_AVX_X64=47,
InstructionSet_AVX2_X64=48,
InstructionSet_AES_X64=49,
InstructionSet_BMI1_X64=50,
InstructionSet_BMI2_X64=51,
InstructionSet_FMA_X64=52,
InstructionSet_LZCNT_X64=53,
InstructionSet_PCLMULQDQ_X64=54,
InstructionSet_POPCNT_X64=55,
InstructionSet_AVXVNNI_X64=56,
InstructionSet_MOVBE_X64=57,
InstructionSet_X86Serialize_X64=58,
InstructionSet_EVEX_X64=59,
InstructionSet_AVX512F_X64=60,
InstructionSet_AVX512BW_X64=61,
InstructionSet_AVX512CD_X64=62,
InstructionSet_AVX512DQ_X64=63,
InstructionSet_AVX512VBMI_X64=64,
InstructionSet_AVX10v1_X64=65,
InstructionSet_AVX10v1_V512_X64=66,
InstructionSet_APX_X64=67,
#endif // TARGET_X86

};
Expand Down Expand Up @@ -344,6 +348,8 @@ struct CORINFO_InstructionSetFlags
AddInstructionSet(InstructionSet_AVX10v1_X64);
if (HasInstructionSet(InstructionSet_AVX10v1_V512))
AddInstructionSet(InstructionSet_AVX10v1_V512_X64);
if (HasInstructionSet(InstructionSet_APX))
AddInstructionSet(InstructionSet_APX_X64);
#endif // TARGET_AMD64
#ifdef TARGET_X86
#endif // TARGET_X86
Expand Down Expand Up @@ -532,6 +538,10 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_AVX10v1_V512);
if (resultflags.HasInstructionSet(InstructionSet_AVX10v1_V512_X64) && !resultflags.HasInstructionSet(InstructionSet_AVX10v1_V512))
resultflags.RemoveInstructionSet(InstructionSet_AVX10v1_V512_X64);
if (resultflags.HasInstructionSet(InstructionSet_APX) && !resultflags.HasInstructionSet(InstructionSet_APX_X64))
resultflags.RemoveInstructionSet(InstructionSet_APX);
if (resultflags.HasInstructionSet(InstructionSet_APX_X64) && !resultflags.HasInstructionSet(InstructionSet_APX))
resultflags.RemoveInstructionSet(InstructionSet_APX_X64);
if (resultflags.HasInstructionSet(InstructionSet_SSE) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_SSE);
if (resultflags.HasInstructionSet(InstructionSet_SSE2) && !resultflags.HasInstructionSet(InstructionSet_SSE))
Expand Down Expand Up @@ -940,6 +950,10 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "VectorT256";
case InstructionSet_VectorT512 :
return "VectorT512";
case InstructionSet_APX :
return "APX";
case InstructionSet_APX_X64 :
return "APX_X64";
#endif // TARGET_AMD64
#ifdef TARGET_X86
case InstructionSet_X86Base :
Expand Down Expand Up @@ -1018,6 +1032,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "VectorT256";
case InstructionSet_VectorT512 :
return "VectorT512";
case InstructionSet_APX :
return "APX";
#endif // TARGET_X86

default:
Expand Down Expand Up @@ -1088,6 +1104,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_VectorT128: return InstructionSet_VectorT128;
case READYTORUN_INSTRUCTION_VectorT256: return InstructionSet_VectorT256;
case READYTORUN_INSTRUCTION_VectorT512: return InstructionSet_VectorT512;
case READYTORUN_INSTRUCTION_Apx: return InstructionSet_APX;
#endif // TARGET_AMD64
#ifdef TARGET_X86
case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base;
Expand Down Expand Up @@ -1125,6 +1142,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_VectorT128: return InstructionSet_VectorT128;
case READYTORUN_INSTRUCTION_VectorT256: return InstructionSet_VectorT256;
case READYTORUN_INSTRUCTION_VectorT512: return InstructionSet_VectorT512;
case READYTORUN_INSTRUCTION_Apx: return InstructionSet_APX;
#endif // TARGET_X86

default:
Expand Down
10 changes: 5 additions & 5 deletions src/coreclr/inc/jiteeversionguid.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ typedef const GUID *LPCGUID;
#define GUID_DEFINED
#endif // !GUID_DEFINED

constexpr GUID JITEEVersionIdentifier = { /* 04021b93-e969-41ed-96cd-4c583673b9ab */
0x04021b93,
0xe969,
0x41ed,
{0x96, 0xcd, 0x4c, 0x58, 0x36, 0x73, 0xb9, 0xab}
constexpr GUID JITEEVersionIdentifier = { /* 381fc250-b8f3-4cee-834e-b0bc682a09f2 */
0x381fc250,
0xb8f3,
0x4cee,
{0x83, 0x4e, 0xb0, 0xbc, 0x68, 0x2a, 0x09, 0xf2}
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/inc/readytoruninstructionset.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ enum ReadyToRunInstructionSet
READYTORUN_INSTRUCTION_Avx10v1=44,
READYTORUN_INSTRUCTION_Avx10v1_V512=46,
READYTORUN_INSTRUCTION_EVEX=47,
READYTORUN_INSTRUCTION_Apx=48,

};

Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,7 @@ static const HWIntrinsicIsaRange hwintrinsicIsaRangeArray[] = {
{ NI_Illegal, NI_Illegal }, // VectorT128
{ NI_Illegal, NI_Illegal }, // VectorT256
{ NI_Illegal, NI_Illegal }, // VectorT512
{ NI_Illegal, NI_Illegal }, // APX
{ FIRST_NI_X86Base_X64, LAST_NI_X86Base_X64 },
{ FIRST_NI_SSE_X64, LAST_NI_SSE_X64 },
{ FIRST_NI_SSE2_X64, LAST_NI_SSE2_X64 },
Expand Down
10 changes: 7 additions & 3 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkMinWin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@
#define REDHAWK_PALEXPORT extern "C"
#define REDHAWK_PALAPI __stdcall

#ifndef XSTATE_MASK_APX
#define XSTATE_MASK_APX (0x80000)
#endif // XSTATE_MASK_APX

// Index for the fiber local storage of the attached thread pointer
static uint32_t g_flsIndex = FLS_OUT_OF_INDEXES;

Expand Down Expand Up @@ -541,7 +545,7 @@ REDHAWK_PALEXPORT CONTEXT* PalAllocateCompleteOSContext(_Out_ uint8_t** contextB
#endif //TARGET_X86

#if defined(TARGET_X86) || defined(TARGET_AMD64)
const DWORD64 xStateFeatureMask = XSTATE_MASK_AVX | XSTATE_MASK_AVX512;
const DWORD64 xStateFeatureMask = XSTATE_MASK_AVX | XSTATE_MASK_AVX512 | XSTATE_MASK_APX;
const ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_MPX | xStateFeatureMask;
#elif defined(TARGET_ARM64)
const DWORD64 xStateFeatureMask = XSTATE_MASK_ARM64_SVE;
Expand Down Expand Up @@ -632,9 +636,9 @@ REDHAWK_PALEXPORT _Success_(return) bool REDHAWK_PALAPI PalGetCompleteThreadCont
// This should not normally fail.
// The system silently ignores any feature specified in the FeatureMask which is not enabled on the processor.
#if defined(TARGET_X86) || defined(TARGET_AMD64)
if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX | XSTATE_MASK_AVX512))
if (!SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX | XSTATE_MASK_AVX512 | XSTATE_MASK_APX))
{
_ASSERTE(!"Could not apply XSTATE_MASK_AVX | XSTATE_MASK_AVX512");
_ASSERTE(!"Could not apply XSTATE_MASK_AVX | XSTATE_MASK_AVX512 | XSTATE_MASK_APX");
return FALSE;
}
#elif defined(TARGET_ARM64)
Expand Down
23 changes: 23 additions & 0 deletions src/coreclr/pal/inc/pal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1374,12 +1374,14 @@ typedef struct _KNONVOLATILE_CONTEXT_POINTERS {
#define XSTATE_AVX512_KMASK (5)
#define XSTATE_AVX512_ZMM_H (6)
#define XSTATE_AVX512_ZMM (7)
#define XSTATE_APX (19)

#define XSTATE_MASK_GSSE (UI64(1) << (XSTATE_GSSE))
#define XSTATE_MASK_AVX (XSTATE_MASK_GSSE)
#define XSTATE_MASK_AVX512 ((UI64(1) << (XSTATE_AVX512_KMASK)) | \
(UI64(1) << (XSTATE_AVX512_ZMM_H)) | \
(UI64(1) << (XSTATE_AVX512_ZMM)))
#define XSTATE_MASK_APX (UI64(1) << (XSTATE_APX))

typedef struct DECLSPEC_ALIGN(16) _M128A {
ULONGLONG Low;
Expand Down Expand Up @@ -1616,6 +1618,27 @@ typedef struct DECLSPEC_ALIGN(16) _CONTEXT {
M512 Zmm30;
M512 Zmm31;
};

struct
{
DWORD64 Egpr16;
DWORD64 Egpr17;
DWORD64 Egpr18;
DWORD64 Egpr19;
DWORD64 Egpr20;
DWORD64 Egpr21;
DWORD64 Egpr22;
DWORD64 Egpr23;
DWORD64 Egpr24;
DWORD64 Egpr25;
DWORD64 Egpr26;
DWORD64 Egpr27;
DWORD64 Egpr28;
DWORD64 Egpr29;
DWORD64 Egpr30;
DWORD64 Egpr31;
};

} CONTEXT, *PCONTEXT, *LPCONTEXT;

//
Expand Down
5 changes: 4 additions & 1 deletion src/coreclr/pal/src/arch/amd64/asmconstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
#define XSTATE_AVX512_KMASK (5)
#define XSTATE_AVX512_ZMM_H (6)
#define XSTATE_AVX512_ZMM (7)
#define XSTATE_APX (19)

#define XSTATE_MASK_GSSE (1 << (XSTATE_GSSE))
#define XSTATE_MASK_AVX (XSTATE_MASK_GSSE)
#define XSTATE_MASK_AVX512 ((1 << (XSTATE_AVX512_KMASK)) | \
(1 << (XSTATE_AVX512_ZMM_H)) | \
(1 << (XSTATE_AVX512_ZMM)))
#define XSTATE_MASK_APX (1 << (XSTATE_APX))

// The arch bit is normally set in the flag constants below. Since this is already arch-specific code and the arch bit is not
// relevant, the arch bit is excluded from the flag constants below for simpler tests.
Expand Down Expand Up @@ -91,7 +93,8 @@
#define CONTEXT_KMask0 CONTEXT_Ymm0H+(16*16)
#define CONTEXT_Zmm0H CONTEXT_KMask0+(8*8)
#define CONTEXT_Zmm16 CONTEXT_Zmm0H+(32*16)
#define CONTEXT_Size CONTEXT_Zmm16+(64*16)
#define CONTEXT_Egpr CONTEXT_Zmm16+(16*8)
#define CONTEXT_Size CONTEXT_Egpr+(64*16)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't look right. It seems it should be

#define CONTEXT_Egpr CONTEXT_Zmm16+(64*16)
#define CONTEXT_Size CONTEXT_Egpr+(16*8)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this comment is still pending.

I'd agree with janvorli's assessment here. Each new define is previousDefine+(SizeOfPreviousDefine), so _Zmm16 should have a size of 64*16, while _Egpr should have a size of 8*16 (typically SizeOfPreviousDefine is built up as SizePerItem*NumberOfItems, so 64*16 because each register is 64 bytes and there are 16 of them, and similarly 8*16 because each register is 8 bytes and there are 16 of them).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for pointing this out, and for the explanation!

Sorry for the delay; we had a holiday yesterday.


#else // HOST_64BIT

Expand Down
20 changes: 20 additions & 0 deletions src/coreclr/pal/src/arch/amd64/context2.S
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,26 @@ LOCAL_LABEL(Done_Restore_CONTEXT_FLOATING_POINT):
kmovq k6, qword ptr [rdi + (CONTEXT_KMask0 + 6 * 8)]
kmovq k7, qword ptr [rdi + (CONTEXT_KMask0 + 7 * 8)]

test BYTE PTR [rdi + CONTEXT_XStateFeaturesMask], XSTATE_MASK_APX
je LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE)

mov r16, qword ptr [rdi + CONTEXT_Egpr + 0 * 8]
mov r17, qword ptr [rdi + CONTEXT_Egpr + 1 * 8]
mov r18, qword ptr [rdi + CONTEXT_Egpr + 2 * 8]
mov r19, qword ptr [rdi + CONTEXT_Egpr + 3 * 8]
mov r20, qword ptr [rdi + CONTEXT_Egpr + 4 * 8]
mov r21, qword ptr [rdi + CONTEXT_Egpr + 5 * 8]
mov r22, qword ptr [rdi + CONTEXT_Egpr + 6 * 8]
mov r23, qword ptr [rdi + CONTEXT_Egpr + 7 * 8]
mov r24, qword ptr [rdi + CONTEXT_Egpr + 8 * 8]
mov r25, qword ptr [rdi + CONTEXT_Egpr + 9 * 8]
mov r26, qword ptr [rdi + CONTEXT_Egpr + 10 * 8]
mov r27, qword ptr [rdi + CONTEXT_Egpr + 11 * 8]
mov r28, qword ptr [rdi + CONTEXT_Egpr + 12 * 8]
mov r29, qword ptr [rdi + CONTEXT_Egpr + 13 * 8]
mov r30, qword ptr [rdi + CONTEXT_Egpr + 14 * 8]
mov r31, qword ptr [rdi + CONTEXT_Egpr + 15 * 8]
tannergooding marked this conversation as resolved.
Show resolved Hide resolved

LOCAL_LABEL(Done_Restore_CONTEXT_XSTATE):

test BYTE PTR [rdi + CONTEXT_ContextFlags], CONTEXT_CONTROL
Expand Down
Loading
Loading