Clocksource tsc unstable

  • The reason for the message below:
2013-07-16T05:00:05.181538-04:00 xxxxxx kernel: Clocksource tsc unstable (delta = -95170507948 ns).  Enable clocksource failover by adding clocksource_failover kernel parameter.
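  • Here the delta of -95170507948 ns is roughly -95.17 seconds: within a single watchdog interval the TSC appears to have fallen about 95 seconds behind the watchdog clocksource (typically HPET or the ACPI PM timer on x86), which is far beyond any tolerable drift.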
  • clocksource structure
/**
* struct clocksource - hardware abstraction for a free running counter
*  Provides mostly state-free accessors to the underlying hardware.
*  This is the structure used for system time.
*
* @name:       ptr to clocksource name
* @list:       list head for registration
* @rating:     rating value for selection (higher is better)
*          To avoid rating inflation the following
*          list should give you a guide as to how
*          to assign your clocksource a rating
*          1-99: Unfit for real use
*              Only available for bootup and testing purposes.
*          100-199: Base level usability.
*              Functional for real use, but not desired.
*          200-299: Good.
*              A correct and usable clocksource.
*          300-399: Desired.
*              A reasonably fast and accurate clocksource.
*          400-499: Perfect
*              The ideal clocksource. A must-use where
*              available.
* @read:       returns a cycle value, passes clocksource as argument
* @enable:     optional function to enable the clocksource
* @disable:        optional function to disable the clocksource
* @mask:       bitmask for two's complement
*          subtraction of non 64 bit counters
* @mult:       cycle to nanosecond multiplier
* @shift:      cycle to nanosecond divisor (power of two)
* @max_idle_ns:    max idle time permitted by the clocksource (nsecs)
* @maxadj:     maximum adjustment value to mult (~11%)
* @flags:      flags describing special properties
* @archdata:       arch-specific data
* @suspend:        suspend function for the clocksource, if necessary
* @resume:     resume function for the clocksource, if necessary
* @cycle_last:     most recent cycle counter value seen by ::read()
* @owner:      module reference, must be set by clocksource in modules
*/
struct clocksource {
    /*
     * Hotpath data, fits in a single cache line when the
     * clocksource itself is cacheline aligned.
     */
    cycle_t (*read)(struct clocksource *cs);
    cycle_t cycle_last;
    cycle_t mask;
    u32 mult;
    u32 shift;
    u64 max_idle_ns;
    u32 maxadj;
#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA
    struct arch_clocksource_data archdata;
#endif

    const char *name;
    struct list_head list;
    int rating;
    int (*enable)(struct clocksource *cs);
    void (*disable)(struct clocksource *cs);
    unsigned long flags;
    void (*suspend)(struct clocksource *cs);
    void (*resume)(struct clocksource *cs);

    /* private: */
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
    /* Watchdog related data, used by the framework */
    struct list_head wd_list;
    cycle_t cs_last;
    cycle_t wd_last;
#endif
    struct module *owner;
} ____cacheline_aligned;
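  • For reference, the mult/shift pair above is what converts raw counter cycles into nanoseconds; the helper used throughout the watchdog code below is essentially the following (as defined in include/linux/clocksource.h):
static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
{
    return ((u64) cycles * mult) >> shift;
}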
  • This message is generated by the clocksource watchdog timer function.
  • clocksource_watchdog() is installed as the handler of watchdog_timer, which is started in clocksource_start_watchdog().
static struct timer_list watchdog_timer;

static inline void clocksource_start_watchdog(void)
{
    if (watchdog_running || !watchdog || list_empty(&watchdog_list))
        return;
    init_timer(&watchdog_timer);
    watchdog_timer.function = clocksource_watchdog;
    watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
    add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
    watchdog_running = 1;
}
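  • For reference, the watchdog fires roughly every half second and tolerates about 62.5 ms of deviation per interval; in kernels of this era the constants in kernel/time/clocksource.c are defined as follows (the exact definitions may vary between versions):
#define WATCHDOG_INTERVAL  (HZ >> 1)            /* check every ~0.5 s */
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)  /* allow ~62.5 ms of drift */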
  • clocksource_watchdog() reads each registered clocksource together with the watchdog clocksource and checks whether there is any large difference between the two measured intervals.
static void clocksource_watchdog(unsigned long data)
{
    struct clocksource *cs;
    cycle_t csnow, wdnow;
    int64_t wd_nsec, cs_nsec;
    int next_cpu;

    spin_lock(&watchdog_lock);
    if (!watchdog_running)
        goto out;

    list_for_each_entry(cs, &watchdog_list, wd_list) {

        /* Clocksource already marked unstable? */
        if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
            if (finished_booting)
                schedule_work(&watchdog_work);
            continue;
        }

        local_irq_disable();
        csnow = cs->read(cs);
        wdnow = watchdog->read(watchdog);
        local_irq_enable();

        /* Clocksource initialized ? */
        if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
            cs->flags |= CLOCK_SOURCE_WATCHDOG;
            cs->wd_last = wdnow;
            cs->cs_last = csnow;
            continue;
        }

        wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
                                     watchdog->mult, watchdog->shift);

        cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
                                     cs->mask, cs->mult, cs->shift);
        cs->cs_last = csnow;
        cs->wd_last = wdnow;

        /* Check the deviation from the watchdog clocksource. */
        if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
            if (clocksource_failover)
                clocksource_unstable(cs, cs_nsec - wd_nsec);
            else
                printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns).  Enable clocksource failover by adding clocksource_failover kernel parameter.\n",
                       cs->name, cs_nsec - wd_nsec);
            continue;
        }

        if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
            (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
            (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
            cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
            /*
             * We just marked the clocksource as highres-capable,
             * notify the rest of the system as well so that we
             * transition into high-res mode:
             */
            tick_clock_notify();
        }
    }

    /*
     * Cycle through CPUs to check if the CPUs stay synchronized
     * to each other.
     */
    next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
    if (next_cpu >= nr_cpu_ids)
        next_cpu = cpumask_first(cpu_online_mask);
    watchdog_timer.expires += WATCHDOG_INTERVAL;
    add_timer_on(&watchdog_timer, next_cpu);
out:
    spin_unlock(&watchdog_lock);
}
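  • One detail worth noting above is the (now - last) & mask arithmetic. Below is a small user-space sketch (my own illustration, not kernel code) of why masking the two's-complement difference stays correct even when a counter narrower than 64 bits wraps between two reads:
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t mask = 0xffffffffULL;   /* pretend the counter is only 32 bits wide */
    uint64_t last = 0xfffffff0ULL;   /* reading taken just before the wrap */
    uint64_t now  = 0x00000010ULL;   /* reading taken just after the wrap */

    /* The raw difference is a huge unsigned value, but masking it down to
     * the counter width recovers the 0x20 cycles that actually elapsed. */
    printf("elapsed cycles = %llu\n",
           (unsigned long long)((now - last) & mask));
    return 0;
}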
  • If the deviation is larger than WATCHDOG_THRESHOLD and clocksource_failover is enabled, the clocksource is marked as unstable (otherwise only the warning above is printed).
static void clocksource_unstable(struct clocksource *cs, int64_t delta)
{
    printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
           cs->name, delta);
    __clocksource_unstable(cs);
}

static void __clocksource_unstable(struct clocksource *cs)
{
    cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
    cs->flags |= CLOCK_SOURCE_UNSTABLE;
    if (finished_booting)
        schedule_work(&watchdog_work);
}
  • Why does this checking happen?
  • The clocksource is used to measure time differences finer than the timer interrupt can provide (the tick period is 1/HZ, e.g. 10ms with HZ=100 or 1ms with HZ=1000).
  • By comparing readings against the watchdog clocksource, the kernel can verify that the clocksource is delivering reliable values. timekeeping_get_ns() below is an example of a hot path that relies on them (a user-space illustration follows the function).
/* Timekeeper helper functions. */
static inline s64 timekeeping_get_ns(void)
{
    cycle_t cycle_now, cycle_delta;
    struct clocksource *clock;

    /* read clocksource: */
    clock = timekeeper.clock;
    cycle_now = clock->read(clock);

    /* calculate the delta since the last update_wall_time: */
    cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;

    /* return delta convert to nanoseconds using ntp adjusted mult. */
    return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
                              timekeeper.shift);
}
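  • As a quick user-space illustration (my own example, not from the kernel source), two back-to-back clock_gettime() calls resolve far below the tick period precisely because they go through the clocksource path above rather than counting ticks:
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec a, b;

    /* The measured gap is typically tens of nanoseconds, far finer than
     * a 1ms-10ms timer tick, because the value comes from the selected
     * clocksource (tsc, hpet, ...), not from jiffies. */
    clock_gettime(CLOCK_MONOTONIC, &a);
    clock_gettime(CLOCK_MONOTONIC, &b);

    printf("delta = %ld ns\n",
           (long)(b.tv_sec - a.tv_sec) * 1000000000L + (b.tv_nsec - a.tv_nsec));
    return 0;
}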
  • To handle the unstable clocksource, the watchdog work starts a kernel thread, ‘kwatchdog’, which changes the clocksource.
static void clocksource_watchdog_work(struct work_struct *work)
{
    /*
     * If kthread_run fails the next watchdog scan over the
     * watchdog_list will find the unstable clock again.
     */
    kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
}
  • clocksource_watchdog_kthread() checks whether a new clocksource is needed and, if so, calls clocksource_select() to change it.
static int clocksource_watchdog_kthread(void *data)
{
    mutex_lock(&clocksource_mutex);
    if (__clocksource_watchdog_kthread())
        clocksource_select();
    mutex_unlock(&clocksource_mutex);
    return 0;
}
  • __clocksource_watchdog_kthread() checks whether any clocksource is unstable (CLOCK_SOURCE_UNSTABLE) or needs to be re-selected (CLOCK_SOURCE_RESELECT).
static int __clocksource_watchdog_kthread(void)
{
    struct clocksource *cs, *tmp;
    unsigned long flags;
    LIST_HEAD(unstable);
    int select = 0;

    spin_lock_irqsave(&watchdog_lock, flags);
    list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
        if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
            list_del_init(&cs->wd_list);
            list_add(&cs->wd_list, &unstable);
            select = 1;
        }
        if (cs->flags & CLOCK_SOURCE_RESELECT) {
            cs->flags &= ~CLOCK_SOURCE_RESELECT;
            select = 1;
        }
    }
    /* Check if the watchdog timer needs to be stopped. */
    clocksource_stop_watchdog();
    spin_unlock_irqrestore(&watchdog_lock, flags);

    /* Needs to be done outside of watchdog lock */
    list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
        list_del_init(&cs->wd_list);
        __clocksource_change_rating(cs, 0);
    }
    return select;
}
  • Unstable clocksources are moved off the watchdog list, and the watchdog itself is stopped by removing watchdog_timer from the timer list, but only if no clocksources remain on that list.
static inline void clocksource_stop_watchdog(void)
{
    if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
        return;
    del_timer(&watchdog_timer);
    watchdog_running = 0;
}
  • Each unstable clocksource then has its rating lowered to 0 by the call below. Since a clocksource with a larger rating value has higher priority and a better chance of being selected, this effectively removes the unstable one from consideration (see the note after clocksource_enqueue()).
static void __clocksource_change_rating(struct clocksource *cs, int rating)
{
    list_del(&cs->list);
    cs->rating = rating;
    clocksource_enqueue(cs);
}

/*
 * Enqueue the clocksource sorted by rating
 */
static void clocksource_enqueue(struct clocksource *cs)
{
    struct list_head *entry = &clocksource_list;
    struct clocksource *tmp;

    list_for_each_entry(tmp, &clocksource_list, list)
        /* Keep track of the place, where to insert */
        if (tmp->rating >= cs->rating)
            entry = &tmp->list;
    list_add(&cs->list, entry);
}
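  • For a sense of scale (my own note, not from the original post): on x86 the TSC clocksource normally registers with rating 300, HPET with 250 and the ACPI PM timer with 200, while jiffies has rating 1. Once the TSC's rating is dropped to 0, HPET or acpi_pm becomes the best remaining candidate.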
  • clocksource_select(), which is called from clocksource_watchdog_kthread(), selects the clocksource with the highest ‘rating’ value, unless a userspace override names a different one.
/**
 * clocksource_select - Select the best clocksource available
 *
 * Private function. Must hold clocksource_mutex when called.
 *
 * Select the clocksource with the best rating, or the clocksource,
 * which is selected by userspace override.
 */
static void clocksource_select(void)
{
    return __clocksource_select(false);
}

static void __clocksource_select(bool skipcur)
{
    bool oneshot = tick_oneshot_mode_active();
    struct clocksource *best, *cs;

    /* Find the best suitable clocksource */
    best = clocksource_find_best(oneshot, skipcur);
    if (!best)
        return;

    /* Check for the override clocksource. */
    list_for_each_entry(cs, &clocksource_list, list) {
        if (skipcur && cs == curr_clocksource)
            continue;
        if (strcmp(cs->name, override_name) != 0)
            continue;
        /*
         * Check to make sure we don't switch to a non-highres
         * capable clocksource if the tick code is in oneshot
         * mode (highres or nohz)
         */
        if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
            /* Override clocksource cannot be used. */
            printk(KERN_WARNING "Override clocksource %s is not "
                   "HRT compatible. Cannot switch while in "
                   "HRT/NOHZ mode\n", cs->name);
            override_name[0] = 0;
        } else
            /* Override clocksource can be used. */
            best = cs;
        break;
    }

    if (curr_clocksource != best && !timekeeping_notify(best)) {
        pr_info("Switched to clocksource %s\n", best->name);
        curr_clocksource = best;
    }
}

static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
{
    struct clocksource *cs;

    if (!finished_booting || list_empty(&clocksource_list))
        return NULL;

    /*
     * We pick the clocksource with the highest rating. If oneshot
     * mode is active, we pick the highres valid clocksource with
     * the best rating.
     */
    list_for_each_entry(cs, &clocksource_list, list) {
        if (skipcur && cs == curr_clocksource)
            continue;
        if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
            continue;
        return cs;
    }
    return NULL;
}
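  • To round this off (my own addition, not part of the kernel code above): the override_name checked in __clocksource_select() is what gets set when you write a clocksource name into sysfs. A small user-space helper that prints the current and available clocksources from the same directory:
#include <stdio.h>

static void dump(const char *path)
{
    char buf[256];
    FILE *f = fopen(path, "r");

    if (f && fgets(buf, sizeof(buf), f))
        printf("%s: %s", path, buf);
    if (f)
        fclose(f);
}

int main(void)
{
    /* Writing a name into current_clocksource (as root) triggers the
     * override path shown in __clocksource_select() above. */
    dump("/sys/devices/system/clocksource/clocksource0/current_clocksource");
    dump("/sys/devices/system/clocksource/clocksource0/available_clocksource");
    return 0;
}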

