Sungju's Slow Life

Personal journal


5-level paging in KVM

Recent kernels, such as those shipped with RHEL 8 and later, support 5-level page tables. You can check whether this is available/enabled by looking at /boot/config-$(uname -r).

$ grep 5LEVEL /boot/config-$(uname -r)
CONFIG_X86_5LEVEL=y

If you don’t want to use the 5-level page table, you can disable it by adding ‘no5lvl’ to the kernel command line (for example with ‘grubby --update-kernel=ALL --args=no5lvl’, or by editing the GRUB configuration) and rebooting the system.

This 5-level paging is also available at the virtual guest level as long as KVM supports it. This means it only works if the CPU provides 5-level paging and the host machine runs RHEL 8 or later.

KVM adds this feature in kvm_set_cpu_caps() as long as the physical CPU supports it.

/*
 * Populate KVM's CPU capability masks from the host CPU (excerpt).
 * LA57 (5-level paging) is advertised to guests only when the host's
 * CPUID leaf 7 ECX reports the feature.
 */
void kvm_set_cpu_caps(void)
{
...
    /* Set LA57 based on hardware capability. */
    if (cpuid_ecx(7) & F(LA57))
        kvm_cpu_cap_set(X86_FEATURE_LA57);
...
}


Here F(LA57) is equivalent to __feature_bit(X86_FEATURE_LA57):

/* From arch/x86/include/asm/cpufeatures.h */
/* Feature word 16, bit 16 — i.e. CPUID.(EAX=7,ECX=0):ECX[16]. */
#define X86_FEATURE_LA57        (16*32+16) /* 5-level page tables */

CPU features are added in kvm_set_cpu_caps() in RHEL 8, but in earlier releases this was handled by different functions such as ‘__do_cpuid_func()’ (RHEL 8.2 and earlier) or ‘__do_cpuid_ent()’ (RHEL 7).

/* RHEL8.2 or earlier */

/*
 * Build one CPUID leaf (entry) for the guest (excerpt; RHEL 8.2 and
 * earlier).  Host support for optional features is probed up front and
 * folded into the per-leaf feature masks declared below.
 */
static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
                  int *nent, int maxnent)
{
    int r;
    /* NX is only advertised when the host has EFER.NX enabled. */
    unsigned f_nx = is_efer_nx() ? F(NX) : 0;
#ifdef CONFIG_X86_64
    /*
     * GBPAGES advertised only when the MMU's largest page level is
     * PDPE — presumably the 1GB-page level; confirm against
     * get_lpage_level()'s definition.
     */
    unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
                ? F(GBPAGES) : 0;
    unsigned f_lm = F(LM);
#else
    /* No long mode or GB pages on 32-bit hosts. */
    unsigned f_gbpages = 0;
    unsigned f_lm = 0;
#endif
    /* Vendor-specific capabilities come via kvm_x86_ops callbacks. */
    unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
    unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
    unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;

    /* cpuid 1.edx */
    const u32 kvm_cpuid_1_edx_x86_features =
        F(FPU) | F(VME) | F(DE) | F(PSE) |
        F(TSC) | F(MSR) | F(PAE) | F(MCE) |
        F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
        F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
        F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
        0 /* Reserved, DS, ACPI */ | F(MMX) |
        F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
        0 /* HTT, TM, Reserved, PBE */;
...

/* RHEL 7 */

/*
 * RHEL 7 counterpart of __do_cpuid_func(): builds one CPUID entry for
 * the guest, additionally taking the subleaf index as a parameter
 * (excerpt; truncated mid-expression).
 */
static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 u32 index, int *nent, int maxnent)
{
    int r;
    /* NX is only advertised when the host has EFER.NX enabled. */
    unsigned f_nx = is_efer_nx() ? F(NX) : 0;
...
    /* cpuid 1.edx */
    const u32 kvm_cpuid_1_edx_x86_features =
        F(FPU) | F(VME) | F(DE) | F(PSE) |
        F(TSC) | F(MSR) | F(PAE) | F(MCE) |
        F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
        F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
        F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
        0 /* Reserved, DS, ACPI */ | F(MMX) |
        F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
        0 /* HTT, TM, Reserved, PBE */;
    /* cpuid 0x80000001.edx */
    const u32 kvm_cpuid_8000_0001_edx_x86_features =
        F(FPU) | F(VME) | F(DE) | F(PSE) |
        F(TSC) | F(MSR) | F(PAE) | F(MCE) |
        F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
        F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
        F(PAT) | F(PSE36) | 0 /* Reserved */ |

During early boot, the system decides whether the kernel needs to handle 5-level paging by checking both that the CPU feature is present and that ‘no5lvl’ was not specified on the command line.

/*
 * Early (decompressor) boot code: decide whether the kernel should
 * enable 5-level paging and return the chosen paging configuration
 * (excerpt).
 */
struct paging_config paging_prepare(void *rmode)
{
...
    /*
     * Check if LA57 is desired and supported.
     *
     * There are several parts to the check:
     *   - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
     *   - if user asked to disable 5-level paging: no5lvl in cmdline
     *   - if the machine supports 5-level paging:
     *     + CPUID leaf 7 is supported
     *     + the leaf has the feature bit set
     *
     * That's substitute for boot_cpu_has() in early boot code.
     */
    /* X86_FEATURE_LA57 & 31 == 16, i.e. CPUID.(EAX=7):ECX bit 16. */
    if (IS_ENABLED(CONFIG_X86_5LEVEL) &&
            !cmdline_find_option_bool("no5lvl") &&
            native_cpuid_eax(0) >= 7 &&
            (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
        paging_config.l5_required = 1;
    }
...
}  

CPUID assembly instruction returns processor identification and feature information in the EAX, EBX, ECX, and EDX registers.

The kernel calls ‘cpuid_count()’ to get this information, which executes the ‘cpuid’ instruction via inline assembly.

/*
 * Execute CPUID with the given leaf (id) and subleaf (count) and
 * return EAX/EBX/ECX/EDX through the four output pointers.
 *
 * The .ifnc/movl/xchgl dance saves and restores %ebx around the
 * instruction when the compiler could not hand %ebx to the asm
 * directly (e.g. when it is reserved for PIC).  EBX_REG is presumably
 * a macro expanding to the output constraint used for *b — confirm
 * against its definition.
 */
static inline void cpuid_count(u32 id, u32 count,
        u32 *a, u32 *b, u32 *c, u32 *d)
{
    asm volatile(".ifnc %%ebx,%3 ; movl  %%ebx,%3 ; .endif  \n\t"
             "cpuid                 \n\t"
             ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif  \n\t"
            : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
            : "a" (id), "c" (count)
    );
}

In a KVM guest, executing this instruction causes a VM exit (SVM_EXIT_CPUID on AMD hardware), which KVM maps to a handler as shown below: when the guest runs ‘cpuid’, ‘kvm_emulate_cpuid()’ will be executed.

/*
 * Table mapping SVM exit codes to human-readable names — presumably
 * used for tracepoint/exit-reason decoding; confirm against its users.
 */
#define SVM_EXIT_REASONS \
...
    { SVM_EXIT_CPUID,       "cpuid" }, \
...



/* Dispatch table (excerpt): one handler per SVM #VMEXIT reason. */
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
...
    /* A guest CPUID instruction exits with SVM_EXIT_CPUID. */
    [SVM_EXIT_CPUID]            = kvm_emulate_cpuid,
...
};

This kvm_emulate_cpuid() retrieves the CPU features by searching the matching kvm_cpuid_entry2 list.

/*
 * Emulate a guest CPUID instruction: read the requested leaf/subleaf
 * from guest RAX/RCX, resolve it via kvm_cpuid(), write the results
 * back into guest RAX/RBX/RCX/RDX, and advance past the instruction.
 */
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
    u32 out_eax, out_ebx, out_ecx, out_edx;

    /*
     * When CPUID faulting is enabled, the instruction is only allowed
     * at CPL 0; kvm_require_cpl() handles the non-zero-CPL case.
     */
    if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
        return 1;

    out_eax = kvm_rax_read(vcpu);   /* leaf (function) */
    out_ecx = kvm_rcx_read(vcpu);   /* subleaf (index) */
    kvm_cpuid(vcpu, &out_eax, &out_ebx, &out_ecx, &out_edx, false);

    /* Propagate all four result registers back to the guest. */
    out_eax = out_eax; /* keep register write order independent */
    kvm_rdx_write(vcpu, out_edx);
    kvm_rcx_write(vcpu, out_ecx);
    kvm_rbx_write(vcpu, out_ebx);
    kvm_rax_write(vcpu, out_eax);

    return kvm_skip_emulated_instruction(vcpu);
}

/*
 * Core of CPUID emulation (excerpt): look up the leaf/subleaf in the
 * vCPU's cpuid_entries table prepared by userspace and fill in the
 * register values.
 */
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
           u32 *ecx, u32 *edx, bool exact_only)
{
    /* On entry *eax holds the leaf (function), *ecx the subleaf. */
    u32 orig_function = *eax, function = *eax, index = *ecx;
    struct kvm_cpuid_entry2 *entry;
    bool exact, used_max_basic = false;

    entry = kvm_find_cpuid_entry(vcpu, function, index);
...
}

/*
 * Convenience wrapper: search this vCPU's CPUID table for the entry
 * matching the given leaf (function) and subleaf (index).
 */
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                          u32 function, u32 index)
{
    struct kvm_cpuid_entry2 *found;

    found = cpuid_entry2_find(vcpu->arch.cpuid_entries,
                  vcpu->arch.cpuid_nent, function, index);
    return found;
}


/*
 * Linear scan of the CPUID entry array.  Returns the first entry
 * whose function matches; the subleaf (index) is compared only when
 * the entry marks it as significant.  Returns NULL if no entry
 * matches.  (KVM_CPUID_FLAG_SIGNIFCANT_INDEX is the flag's actual
 * spelling in the UAPI headers.)
 */
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
    struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
{
    struct kvm_cpuid_entry2 *e = entries;
    struct kvm_cpuid_entry2 * const end = entries + nent;

    for (; e < end; e++) {
        if (e->function != function)
            continue;
        /* Index only matters when the significant-index flag is set. */
        if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) ||
            e->index == index)
            return e;
    }

    return NULL;
}

/* when an old userspace process fills a new kernel module */
/*
 * KVM_SET_CPUID (legacy) handler (excerpt): copy the userspace
 * kvm_cpuid_entry array into the kernel and convert it to the
 * kvm_cpuid_entry2 layout used internally.
 */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
                 struct kvm_cpuid *cpuid,
                 struct kvm_cpuid_entry __user *entries)
{
    int r, i;
    struct kvm_cpuid_entry *e = NULL;
    struct kvm_cpuid_entry2 *e2 = NULL;

    /* Bound the entry count before any allocation. */
    if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
        return -E2BIG;

    if (cpuid->nent) {
        /* Snapshot the userspace array into kernel memory. */
        e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent));
        if (IS_ERR(e))
            return PTR_ERR(e);

        e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT);
        if (!e2) {
            r = -ENOMEM;
            goto out_free_cpuid;
        }
    }
    /* Convert each legacy entry to the entry2 layout, field by field. */
    for (i = 0; i < cpuid->nent; i++) {
        e2[i].function = e[i].function;
        e2[i].eax = e[i].eax;
...
}

/*
 * Architecture-specific vCPU ioctl dispatch (excerpt): the
 * KVM_SET_CPUID case copies the fixed-size header from userspace and
 * passes the trailing entry array on to kvm_vcpu_ioctl_set_cpuid().
 */
long kvm_arch_vcpu_ioctl(struct file *filp,
             unsigned int ioctl, unsigned long arg)
{
...
    case KVM_SET_CPUID: {
        struct kvm_cpuid __user *cpuid_arg = argp;
        struct kvm_cpuid cpuid;

        r = -EFAULT;
        if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
            goto out;
        /*
         * NOTE(review): cpuid_arg->entries presumably names a
         * trailing (flexible) array member, so this computes an
         * address rather than dereferencing userspace memory —
         * confirm against struct kvm_cpuid.
         */
        r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
        break;
    }
..
}

/*
 * Generic vCPU ioctl entry point (excerpt): any ioctl not handled in
 * the common switch falls through to the arch-specific handler.
 */
static long kvm_vcpu_ioctl(struct file *filp,
               unsigned int ioctl, unsigned long arg)
{
...
    default:
        r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
    }
...
}


Leave a comment

This site uses Akismet to reduce spam. Learn how your comment data is processed.