Latest kernels such as RHEL8 and above are supporting 5-level page table. You can check if this is available/enabled by looking at /boot/config-$(uname -r).
$ grep 5LEVEL /boot/config-$(uname -r)
CONFIG_X86_5LEVEL=y
If you don’t want to use 5-level page table, you can disable that by adding ‘no5lvl’ as a kernel parameter in grub file (/boot/grub2/grubenv) and reboot the system.
This 5 level paging is also available in virtual guest level as long as KVM supports it. It means it only works if CPU provides 5-level paging and host machine is RHEL8 or above.
KVM add this feature in kvm_set_caps() as long as physical CPU supports it.
void kvm_set_cpu_caps(void)
{
...
/* Set LA57 based on hardware capability. */
if (cpuid_ecx(7) & F(LA57))
kvm_cpu_cap_set(X86_FEATURE_LA57);
...
}
F(LA57) === __feature_bit(X86_FEATURE_LA57)
/* From arch/x86/include/asm/cpufeatures.h */
#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
CPU features are added in kvm_set_cpu_caps() in RHEL8, but earlier, it was handled by different functions such as ‘__do_cpuid_func()’ or ‘__do_cpuid_ent’
/* RHEL8.2 or earlier */
static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
int *nent, int maxnent)
{
int r;
unsigned f_nx = is_efer_nx() ? F(NX) : 0;
#ifdef CONFIG_X86_64
unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
? F(GBPAGES) : 0;
unsigned f_lm = F(LM);
#else
unsigned f_gbpages = 0;
unsigned f_lm = 0;
#endif
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
F(FPU) | F(VME) | F(DE) | F(PSE) |
F(TSC) | F(MSR) | F(PAE) | F(MCE) |
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
0 /* Reserved, DS, ACPI */ | F(MMX) |
F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
0 /* HTT, TM, Reserved, PBE */;
...
/* RHEL 7 */
static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index, int *nent, int maxnent)
{
int r;
unsigned f_nx = is_efer_nx() ? F(NX) : 0;
...
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
F(FPU) | F(VME) | F(DE) | F(PSE) |
F(TSC) | F(MSR) | F(PAE) | F(MCE) |
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
0 /* Reserved, DS, ACPI */ | F(MMX) |
F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
0 /* HTT, TM, Reserved, PBE */;
/* cpuid 0x80000001.edx */
const u32 kvm_cpuid_8000_0001_edx_x86_features =
F(FPU) | F(VME) | F(DE) | F(PSE) |
F(TSC) | F(MSR) | F(PAE) | F(MCE) |
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
F(PAT) | F(PSE36) | 0 /* Reserved */ |
System is checking if kernel needs to handle 5 level paging by checking the feature and ‘no5lvl’ is not specified.
struct paging_config paging_prepare(void *rmode)
{
...
/*
* Check if LA57 is desired and supported.
*
* There are several parts to the check:
* - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
* - if user asked to disable 5-level paging: no5lvl in cmdline
* - if the machine supports 5-level paging:
* + CPUID leaf 7 is supported
* + the leaf has the feature bit set
*
* That's substitute for boot_cpu_has() in early boot code.
*/
if (IS_ENABLED(CONFIG_X86_5LEVEL) &&
!cmdline_find_option_bool("no5lvl") &&
native_cpuid_eax(0) >= 7 &&
(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
paging_config.l5_required = 1;
}
...
}
CPUID assembly instruction returns processor identification and feature information in the EAX, EBX, ECX, and EDX registers.
Kernel is calling ‘cpuid_count()’ to get this information which is calling ‘cpuid’ as an inline assembly.
static inline void cpuid_count(u32 id, u32 count,
u32 *a, u32 *b, u32 *c, u32 *d)
{
asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
"cpuid \n\t"
".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
: "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
: "a" (id), "c" (count)
);
}
This instruction is mapped in kvm by using the below. When ‘cpuid’ is called, ‘kvm_emulate_cpuid()’ will be executed.
#define SVM_EXIT_REASONS \
...
{ SVM_EXIT_CPUID, "cpuid" }, \
...
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
...
[SVM_EXIT_CPUID] = kvm_emulate_cpuid,
...
};
This kvm_emulate_cpud() is getting CPU features by checking matching kvm_cpuid_entry2 list.
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
u32 eax, ebx, ecx, edx;
if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
return 1;
eax = kvm_rax_read(vcpu);
ecx = kvm_rcx_read(vcpu);
kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
kvm_rax_write(vcpu, eax);
kvm_rbx_write(vcpu, ebx);
kvm_rcx_write(vcpu, ecx);
kvm_rdx_write(vcpu, edx);
return kvm_skip_emulated_instruction(vcpu);
}
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool exact_only)
{
u32 orig_function = *eax, function = *eax, index = *ecx;
struct kvm_cpuid_entry2 *entry;
bool exact, used_max_basic = false;
entry = kvm_find_cpuid_entry(vcpu, function, index);
...
}
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index)
{
return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
function, index);
}
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
{
struct kvm_cpuid_entry2 *e;
int i;
for (i = 0; i < nent; i++) {
e = &entries[i];
if (e->function == function &&
(!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index))
return e;
}
return NULL;
}
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid,
struct kvm_cpuid_entry __user *entries)
{
int r, i;
struct kvm_cpuid_entry *e = NULL;
struct kvm_cpuid_entry2 *e2 = NULL;
if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
return -E2BIG;
if (cpuid->nent) {
e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent));
if (IS_ERR(e))
return PTR_ERR(e);
e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT);
if (!e2) {
r = -ENOMEM;
goto out_free_cpuid;
}
}
for (i = 0; i < cpuid->nent; i++) {
e2[i].function = e[i].function;
e2[i].eax = e[i].eax;
...
}
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
...
case KVM_SET_CPUID: {
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
break;
}
..
}
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
...
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}
...
}
Leave a Reply