How intel_idle works

When the system is idle — meaning there is nothing to run and the swapper (idle) task is executing — the kernel calls cpuidle_idle_call(), as shown in the backtrace below.

    [exception RIP: cpuidle_enter_state+0x57]
    RIP: ffffffff833c1a67  RSP: ffff8917ce84be60  RFLAGS: 00000202
    RAX: 0001332321398719  RBX: ffffffff82eaf37b  RCX: 0000000000000018
    RDX: 0000000225c17d03  RSI: ffff8917ce84bfd8  RDI: 0001332321398719
    RBP: ffff8917ce84be88   R8: 0000000000000130   R9: 0000000000000018
    R10: 00000000000000c3  R11: 0000000000000400  R12: 00000001141c5477
    R13: 00013323212a4180  R14: ffff89977fd5ac80  R15: ffffffff82ecaa77
    ORIG_RAX: ffffffffffffff10  CS: 0010  SS: 0018
#19 [ffff8917ce84be90] cpuidle_idle_call at ffffffff833c1bbe
#20 [ffff8917ce84bed0] arch_cpu_idle at ffffffff82e37c6e
#21 [ffff8917ce84bee0] cpu_startup_entry at ffffffff82f0159a
#22 [ffff8917ce84bf28] start_secondary at ffffffff82e5a0c7
#23 [ffff8917ce84bf50] start_cpu at ffffffff82e000d5

This cpuidle_idle_call() is called from arch_cpu_idle().

/*
 * Called from the generic idle code.
 *
 * Tries the cpuidle framework first; if cpuidle_idle_call() returns
 * non-zero (no driver, device disabled, etc.) it falls back to the
 * simpler x86_idle() routine, turning IBRS off around the idle period.
 * On the success path cpuidle returns with interrupts still disabled,
 * so they are re-enabled here.
 */
void arch_cpu_idle(void)
{
    if (cpuidle_idle_call()) {
        /* The cpuidle call failed, fallback to a simpler idle */
        spec_ctrl_ibrs_off();
        x86_idle();
        spec_ctrl_ibrs_on();
    } else
        local_irq_enable();
}

cpuidle_idle_call() is the main idle loop. It checks whether an idle driver is installed and active and, if so, takes the further steps to change the CPU's power state by calling cpuidle_enter_state().

/**
 * cpuidle_idle_call - the main idle loop
 *
 * Entered with interrupts disabled.  Asks the current governor to pick
 * a C-state, then enters it through the registered cpuidle driver.
 *
 * NOTE: no locks or semaphores should be used here
 * return non-zero on failure
 */
int cpuidle_idle_call(void)
{
    struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
    struct cpuidle_driver *drv;
    int next_state, entered_state;

    /* cpuidle globally switched off */
    if (off)
        return -ENODEV;

    /* framework not (yet) initialized */
    if (!initialized)
        return -ENODEV;

    /* check if the device is ready */
    if (!dev || !dev->enabled)
        return -EBUSY;

    drv = cpuidle_get_cpu_driver(dev);

    /* ask the governor for the next state */
    next_state = cpuidle_curr_governor->select(drv, dev);  <-- check next state
    if (need_resched()) {
        /* work arrived while selecting a state: bail out without idling */
        dev->last_residency = 0;
        /* give the governor an opportunity to reflect on the outcome */
        if (cpuidle_curr_governor->reflect)
            cpuidle_curr_governor->reflect(dev, next_state);
        local_irq_enable();
        return 0;
    }

    trace_cpu_idle_rcuidle(next_state, dev->cpu);

    /*
     * States flagged CPUIDLE_FLAG_TIMER_STOP stop the per-CPU timer,
     * so hand timekeeping over to the broadcast clockevent device
     * while we are in the state.
     */
    if (drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP)
        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
                   &dev->cpu);

    /* coupled states must be entered in lockstep with sibling CPUs */
    if (cpuidle_state_is_coupled(dev, drv, next_state))
        entered_state = cpuidle_enter_state_coupled(dev, drv,
                                next_state);
    else
        entered_state = cpuidle_enter_state(dev, drv, next_state);		<--- enter

    if (drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP)
        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
                   &dev->cpu);

    trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);

    /* give the governor an opportunity to reflect on the outcome */
    if (cpuidle_curr_governor->reflect)
        cpuidle_curr_governor->reflect(dev, entered_state);

    return 0;
}

cpuidle_enter_state() actually enters the chosen state by calling the driver's enter function.

/**
 * cpuidle_enter_state - enter the state and update stats
 * @dev: cpuidle device for this cpu
 * @drv: cpuidle driver for this cpu
 * @next_state: index into drv->states of the state to enter
 *
 * Returns the index of the state actually entered (the driver's
 * ->enter() return value), or a negative value on failure.
 */
int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
            int index)
{
    int entered_state;

    struct cpuidle_state *target_state = &drv->states[index];
    ktime_t time_start, time_end;
    s64 diff;

    /* timestamp around the driver call to measure actual residency */
    time_start = ktime_get();

    entered_state = target_state->enter(dev, drv, index);		<-- driver's enter function.

    time_end = ktime_get();

    local_irq_enable();

    /* residency in microseconds, clamped to fit in an int */
    diff = ktime_to_us(ktime_sub(time_end, time_start));
    if (diff > INT_MAX)
        diff = INT_MAX;

    dev->last_residency = (int) diff;

    if (entered_state >= 0) {
        /* Update cpuidle counters */
        /* This can be moved to within driver enter routine
         * but that results in multiple copies of same code.
         */
        dev->states_usage[entered_state].time += dev->last_residency;
        dev->states_usage[entered_state].usage++;
    } else {
        /* driver failed to enter the state: discard the measurement */
        dev->last_residency = 0;
    }

    return entered_state;
}

The driver's state table is set up by calling intel_idle_state_table_update(), which is invoked via the sequence below. If idle=xxx was provided as a kernel parameter, the cstates update is skipped because intel_idle_init() returns early.

...
    } else if (!strcmp(str, "nomwait")) {
        /*
         * If the boot option of "idle=nomwait" is added,
         * it means that mwait will be disabled for CPU C2/C3
         * states. In such case it won't touch the variable
         * of boot_option_idle_override.
         */
        boot_option_idle_override = IDLE_NOMWAIT;
        
        


static int __init intel_idle_init(void)
{
    int retval, i;

    /* Do not load intel_idle at all for now if idle= is passed */
    if (boot_option_idle_override != IDLE_NO_OVERRIDE)
        return -ENODEV;
...

Call sequence:
intel_idle_init -> intel_idle_cpuidle_driver_init -> intel_idle_state_table_update -> sklh_idle_state_table_update

/*
 * Dispatch table-tweaking quirks per CPU model.  Models not listed
 * here keep their default C-state tables unchanged.
 */
static void intel_idle_state_table_update(void)
{
    switch (boot_cpu_data.x86_model) {

    case INTEL_FAM6_IVYBRIDGE_X:
        ivt_idle_state_table_update();
        break;
    case INTEL_FAM6_ATOM_GOLDMONT:
    case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
        bxt_idle_state_table_update();
        break;
    case INTEL_FAM6_SKYLAKE_DESKTOP:
        sklh_idle_state_table_update();
        break;
    }
}

This applies model-specific adjustments to the CPU's cstate table. Below is the function for Skylake.

/*
 * sklh_idle_state_table_update(void)
 *
 * On SKL-H (model 0x5e) disable C8 and C9 if:
 * C10 is enabled and SGX disabled
 *
 * Rationale: all of the early-return checks below look for a reason to
 * keep C8/C9; only if C10 is usable (deep enough max_cstate, advertised
 * in CPUID, allowed by the PKG C-state limit) and SGX is not enabled do
 * we mark the C8/C9 table entries disabled.
 */
static void sklh_idle_state_table_update(void)
{
    unsigned long long msr;
    unsigned int eax, ebx, ecx, edx;


    /* if PC10 disabled via cmdline intel_idle.max_cstate=7 or shallower */
    if (max_cstate <= 7)
        return;

    /* if PC10 not present in CPUID.MWAIT.EDX */
    /* (bits 31:28 hold the C10 sub-state count) */
    if ((mwait_substates & (0xF << 28)) == 0)
        return;

    rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr);

    /* PC10 is not enabled in PKG C-state limit */
    /* (low nibble of the MSR is the package C-state limit field) */
    if ((msr & 0xF) != 8)
        return;

    ecx = 0;
    cpuid(7, &eax, &ebx, &ecx, &edx);

    /* if SGX is present */
    /* (CPUID leaf 7, EBX bit 2) */
    if (ebx & (1 << 2)) {

        rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);

        /* if SGX is enabled */
        /* (feature-control MSR bit 18) -- keep C8/C9 in that case */
        if (msr & (1 << 18))
            return;
    }

    skl_cstates[5].disabled = 1;    /* C8-SKL */
    skl_cstates[6].disabled = 1;    /* C9-SKL */
}

As a result, the global variable 'skl_cstates' is updated; its default contents are shown below.

/*
 * Default Skylake client C-state table.  Each entry maps a named
 * C-state to its MWAIT hint (.desc/.flags), wake-up cost
 * (.exit_latency), the minimum idle time that makes the state
 * worthwhile (.target_residency), and the enter callback.
 * Indexes 5 (C8) and 6 (C9) are the entries that
 * sklh_idle_state_table_update() may mark disabled.
 * The .enter == NULL entry terminates the table.
 */
static struct cpuidle_state skl_cstates[] = {
    {
        .name = "C1-SKL",
        .desc = "MWAIT 0x00",
        .flags = MWAIT2flg(0x00),
        .exit_latency = 2,
        .target_residency = 2,
        .enter = &intel_idle },
    {
        .name = "C1E-SKL",
        .desc = "MWAIT 0x01",
        .flags = MWAIT2flg(0x01),
        .exit_latency = 10,
        .target_residency = 20,
        .enter = &intel_idle },
    {
        .name = "C3-SKL",
        .desc = "MWAIT 0x10",
        .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
        .exit_latency = 70,
        .target_residency = 100,
        .enter = &intel_idle },
    {
        .name = "C6-SKL",
        .desc = "MWAIT 0x20",
        .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
        .exit_latency = 85,
        .target_residency = 200,
        .enter = &intel_idle },
    {
        .name = "C7s-SKL",
        .desc = "MWAIT 0x33",
        .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED,
        .exit_latency = 124,
        .target_residency = 800,
        .enter = &intel_idle },
    {
        /* disabled by sklh_idle_state_table_update() when C10 wins */
        .name = "C8-SKL",
        .desc = "MWAIT 0x40",
        .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
        .exit_latency = 200,
        .target_residency = 800,
        .enter = &intel_idle },
    {
        /* disabled by sklh_idle_state_table_update() when C10 wins */
        .name = "C9-SKL",
        .desc = "MWAIT 0x50",
        .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
        .exit_latency = 480,
        .target_residency = 5000,
        .enter = &intel_idle, },
    {
        .name = "C10-SKL",
        .desc = "MWAIT 0x60",
        .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
        .exit_latency = 890,
        .target_residency = 5000,
        .enter = &intel_idle },
    {
        .enter = NULL }
};

With the above settings, when 'enter' is called, the actual function intel_idle() runs. It keeps the CPU in an idle state while minimizing power consumption by using the 'monitor/mwait' instructions.

/**
 * intel_idle
 * @dev: cpuidle_device
 * @drv: cpuidle driver
 * @index: index of cpuidle state
 *
 * Enters the MWAIT-based C-state described by drv->states[index] and
 * returns @index on wake-up.
 *
 * Must be called under local_irq_disable().
 */
static __cpuidle int intel_idle(struct cpuidle_device *dev,
                struct cpuidle_driver *drv, int index)
{
    unsigned long ecx = 1; /* break on interrupt flag */
    struct cpuidle_state *state = &drv->states[index];
    unsigned long eax = flg2MWAIT(state->flags);   /* MWAIT hint for this state */
    unsigned int cstate;
    int cpu = smp_processor_id();

    /* decode the C-state number back out of the MWAIT hint */
    cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;

    /*
     * leave_mm() to avoid costly and often unnecessary wakeups
     * for flushing the user TLB's associated with the active mm.
     */
    if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
        leave_mm(cpu);

    /*
     * If the local APIC timer is not reliable in this C-state, switch
     * to the broadcast timer before entering (and back after exit).
     */
    if (!(lapic_timer_reliable_states & (1 << (cstate))))
        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

    mwait_idle_with_hints(eax, ecx);				<-- Actual idle with optimised monitor/mwait

    if (!(lapic_timer_reliable_states & (1 << (cstate))))
        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);

    return index;
}

Below is the actual monitor/mwait routine that is called from intel_idle().

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 *
 * @eax: MWAIT hint (encodes the target C-state/sub-state)
 * @ecx: MWAIT extensions (bit 0 = break on interrupt even if disabled)
 */
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
    /*
     * current_set_polling_and_test() advertises this CPU as polling on
     * the thread flags, so a remote wake-up can skip the IPI; if work
     * is already pending it returns true and we skip MWAIT entirely.
     * On CPUs with X86_BUG_MONITOR the polling optimization is bypassed.
     */
    if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
        /* errata workaround: flush the monitored line before MONITOR */
        if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
            clflush((void *)&current_thread_info()->flags);

        /*
         * IRQs must be disabled here and nmi uses the
         * save_paranoid model which always enables ibrs on
         * exception entry before any indirect jump can run.
         */
        spec_ctrl_ibrs_off();
        /* arm MONITOR on the thread flags, then wait unless work arrived */
        __monitor((void *)&current_thread_info()->flags, 0, 0);
        if (!need_resched())
            __mwait(eax, ecx);
        spec_ctrl_ibrs_on();
    }
    __current_clr_polling();
}

So, from the above we can see that mwait uses a CPU-specific cstates table, which tells the kernel the best way to wait in mwait — optimized for each CPU type. If 'idle=nomwait' is used, none of the above is executed and a plain idle loop is used instead, which is less efficient.

If cpuidle_idle_call() fails, the kernel falls back to default_idle(), which calls safe_halt(). On x86, safe_halt() is native_safe_halt().

/*
 * Fallback idle routine: trace entry/exit around a simple safe_halt()
 * (enable interrupts + HLT).  Used when no cpuidle driver is active.
 */
void default_idle(void)
{
    trace_cpu_idle_rcuidle(1, smp_processor_id());
    safe_halt();
    trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}

safe_halt == native_safe_halt

native_safe_halt() simply executes the 'hlt' instruction.

/*
 * Enable interrupts and halt until the next interrupt.  The CPU-buffer
 * clear beforehand is, per its name, the MDS idle mitigation.  "sti; hlt"
 * as a pair ensures the halt begins with interrupts enabled so the CPU
 * can be woken.
 */
static inline __cpuidle void native_safe_halt(void)
{
    mds_idle_clear_cpu_buffers();
    asm volatile("sti; hlt": : :"memory");
}

‘hlt’ is described something like below.

Stops instruction execution and places the processor in a HALT state. An enabled interrupt (including NMI and SMI), a debug exception, the BINIT# signal, the INIT# signal, or the RESET# signal will resume execution. If an interrupt (including NMI) is used to resume execution after a HLT instruction, the saved instruction pointer (CS:EIP) points to the instruction following the HLT instruction.

Leave a Reply

Please log in using one of these methods to post your comment:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

This site uses Akismet to reduce spam. Learn how your comment data is processed.