Personal memo for ‘Automatic NUMA Balancing’

Automatic NUMA Balancing

It is described in Documentation/sysctl/kernel.txt

numa_balancing

Enables/disables automatic page fault based NUMA memory
balancing. Memory is moved automatically to nodes
that access it often.

Enables/disables automatic NUMA memory balancing. On NUMA machines, there
is a performance penalty if remote memory is accessed by a CPU. When this
feature is enabled the kernel samples what task thread is accessing memory
by periodically unmapping pages and later trapping a page fault. At the
time of the page fault, it is determined if the data being accessed should
be migrated to a local memory node.

The unmapping of pages and trapping faults incur additional overhead that
ideally is offset by improved memory locality but there is no universal
guarantee. If the target workload is already bound to NUMA nodes then this
feature should be disabled. Otherwise, if the system overhead from the
feature is too high then the rate the kernel samples for NUMA hinting
faults may be controlled by the numa_balancing_scan_period_min_ms,
numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
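
Enabling or disabling it at runtime is a single integer write. As a reminder to myself, a minimal user-space sketch (assuming the usual /proc/sys path for kernel.numa_balancing; `sysctl kernel.numa_balancing=0` does the same thing):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
  const char *val = (argc > 1) ? argv[1] : "1";  /* "0" disables, "1" enables */
  int fd = open("/proc/sys/kernel/numa_balancing", O_WRONLY);

  if (fd < 0) {
    perror("open");                              /* writing needs root/CAP_SYS_ADMIN */
    return 1;
  }
  if (write(fd, val, strlen(val)) < 0)
    perror("write");
  close(fd);
  return 0;
}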

It’s defined in kernel/sysctl.c

 394   {
 395     .procname = "numa_balancing",
 396     .data   = NULL, /* filled in by handler */
 397     .maxlen   = sizeof(unsigned int),
 398     .mode   = 0644,
 399     .proc_handler = sysctl_numa_balancing,
 400     .extra1   = &zero,
 401     .extra2   = &one,
 402   },
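
The .extra1/.extra2 bounds mean proc_dointvec_minmax() accepts only 0 or 1. A quick test sketch of mine (not kernel code) that should see an out-of-range write rejected with EINVAL:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
  int fd = open("/proc/sys/kernel/numa_balancing", O_WRONLY);

  if (fd < 0) {
    perror("open");
    return 1;
  }
  if (write(fd, "2", 1) < 0)   /* 2 is outside the 0..1 range enforced above */
    printf("write rejected: %s\n", strerror(errno));
  close(fd);
  return 0;
}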

There are also related sysctl parameters:

kernel.numa_balancing_scan_delay_ms = 1000
kernel.numa_balancing_scan_period_max_ms = 60000
kernel.numa_balancing_scan_period_min_ms = 1000
kernel.numa_balancing_scan_size_mb = 256
kernel.numa_balancing_settle_count = 4

The sysctl proc handler is defined in kernel/sched/core.c:

#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
       void __user *buffer, size_t *lenp, loff_t *ppos)
{
  struct ctl_table t;
  int err;
  int state = numabalancing_enabled;

  if (write && !capable(CAP_SYS_ADMIN))
    return -EPERM;

  t = *table;
  t.data = &state;
  err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
  if (err < 0)
    return err;
  if (write)
    set_numabalancing_state(state);
  return err;
}
#endif /* CONFIG_PROC_SYSCTL */
#endif /* CONFIG_NUMA_BALANCING */


#ifdef CONFIG_NUMA_BALANCING
#ifdef CONFIG_SCHED_DEBUG
void set_numabalancing_state(bool enabled)
{
  if (enabled)
    sched_feat_set("NUMA");
  else
    sched_feat_set("NO_NUMA");
}
#else
__read_mostly bool numabalancing_enabled;

void set_numabalancing_state(bool enabled)
{
  numabalancing_enabled = enabled;
}
#endif /* CONFIG_SCHED_DEBUG */
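
Side note: on a CONFIG_SCHED_DEBUG kernel the state is routed through sched_feat_set(), so it also shows up as the NUMA/NO_NUMA flag in the scheduler features file. A sketch for checking it (assuming debugfs is mounted at /sys/kernel/debug):

#include <stdio.h>
#include <string.h>

int main(void)
{
  FILE *f = fopen("/sys/kernel/debug/sched_features", "r");
  char buf[4096];

  if (!f) {
    perror("sched_features");   /* no CONFIG_SCHED_DEBUG or debugfs not mounted */
    return 1;
  }
  if (fgets(buf, sizeof(buf), f)) {
    if (strstr(buf, "NO_NUMA"))
      puts("automatic NUMA balancing: off (NO_NUMA)");
    else if (strstr(buf, "NUMA"))
      puts("automatic NUMA balancing: on (NUMA)");
  }
  fclose(f);
  return 0;
}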

There are two locations that use ‘numabalancing_enabled’.

1713 /*
1714  * Got a PROT_NONE fault for a page on @node.
1715  */
1716 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1717 {
1718   struct task_struct *p = current;
1719   bool migrated = flags & TNF_MIGRATED;
1720   int cpu_node = task_node(current);
1721   int priv;
1722 
1723   if (!numabalancing_enabled)
1724     return;
1725 
1726   /* for example, ksmd faulting in a user's mm */
1727   if (!p->mm)
1728     return;
1729 
1730   /* Do not worry about placement if exiting */
1731   if (p->state == TASK_DEAD)
1732     return;
1733 
1734   /* Allocate buffer to track faults on a per-node basis */
1735   if (unlikely(!p->numa_faults_memory)) {
1736     int size = sizeof(*p->numa_faults_memory) *
1737          NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1738 
1739     p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1740     if (!p->numa_faults_memory)
1741       return;
1742 
1743     BUG_ON(p->numa_faults_buffer_memory);
1744     /*
1745      * The averaged statistics, shared & private, memory & cpu,
1746      * occupy the first half of the array. The second half of the
1747      * array is for current counters, which are averaged into the
1748      * first set by task_numa_placement.
1749      */
1750     p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1751     p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1752     p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1753     p->total_numa_faults = 0;
1754     memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1755   }
1756 
1757   /*
1758    * First accesses are treated as private, otherwise consider accesses
1759    * to be private if the accessing pid has not changed
1760    */
1761   if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
1762     priv = 1;
1763   } else {
1764     priv = cpupid_match_pid(p, last_cpupid);
1765     if (!priv && !(flags & TNF_NO_GROUP))
1766       task_numa_group(p, last_cpupid, flags, &priv);
1767   }
1768 
1769   task_numa_placement(p);
1770 
1771   /*
1772    * Retry task to preferred node migration periodically, in case it
1773    * case it previously failed, or the scheduler moved us.
1774    */
1775   if (time_after(jiffies, p->numa_migrate_retry))
1776     numa_migrate_preferred(p);
1777 
1778   if (migrated)
1779     p->numa_pages_migrated += pages;
1780 
1781   p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1782   p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1783   p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1784 }
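
The per-task fault counters live in one kzalloc'ed array split into four regions (averaged memory/cpu stats first, the per-scan buffers behind them), as the comment in the allocation path says. A small user-space model of that layout; the bucket count and the task_faults_idx() formula are my assumption of how this kernel version defines them, shown only to make the pointer arithmetic readable:

#include <stdio.h>
#include <stdlib.h>

enum { NR_NUMA_HINT_FAULT_TYPES   = 2 };  /* shared (0) and private (1) faults */
enum { NR_NUMA_HINT_FAULT_BUCKETS = 8 };  /* 4 regions x 2 counters per node */

/* assumed to match the kernel's task_faults_idx(): two slots per node */
static int task_faults_idx(int nid, int priv)
{
  return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
}

int main(void)
{
  int nr_node_ids = 2;   /* example: two-node machine */
  unsigned long *numa_faults_memory =
    calloc(NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids, sizeof(unsigned long));

  /* the four regions carved out of the single allocation */
  unsigned long *numa_faults_cpu           = numa_faults_memory + 2 * nr_node_ids;
  unsigned long *numa_faults_buffer_memory = numa_faults_memory + 4 * nr_node_ids;
  unsigned long *numa_faults_buffer_cpu    = numa_faults_memory + 6 * nr_node_ids;

  /* record one private fault against node 1, the way task_numa_fault() would */
  numa_faults_buffer_memory[task_faults_idx(1, 1)] += 1;
  numa_faults_buffer_cpu[task_faults_idx(0, 1)] += 1;

  printf("buffer_memory[node 1, private] = %lu\n",
         numa_faults_buffer_memory[task_faults_idx(1, 1)]);
  printf("averaged cpu counter starts at %lu\n",
         numa_faults_cpu[task_faults_idx(0, 1)]);

  free(numa_faults_memory);
  return 0;
}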

It calls ‘task_numa_placement()’, which picks the preferred node, and ‘numa_migrate_preferred()’, which performs the actual migration.

1463 static void task_numa_placement(struct task_struct *p)
1464 {
1465   int seq, nid, max_nid = -1, max_group_nid = -1;
1466   unsigned long max_faults = 0, max_group_faults = 0;
1467   unsigned long fault_types[2] = { 0, 0 };
1468   unsigned long total_faults;
1469   u64 runtime, period;
1470   spinlock_t *group_lock = NULL;
1471 
1472   seq = ACCESS_ONCE(p->mm->numa_scan_seq);
1473   if (p->numa_scan_seq == seq)
1474     return;
1475   p->numa_scan_seq = seq;
1476   p->numa_scan_period_max = task_scan_max(p);
1477 
1478   total_faults = p->numa_faults_locality[0] +
1479            p->numa_faults_locality[1];
1480   runtime = numa_get_avg_runtime(p, &period);
1481 
1482   /* If the task is part of a group prevent parallel updates to group stats */
1483   if (p->numa_group) {
1484     group_lock = &p->numa_group->lock;
1485     spin_lock(group_lock);
1486   }
1487 
1488   /* Find the node with the highest number of faults */
1489   for_each_online_node(nid) {
1490     unsigned long faults = 0, group_faults = 0;
1491     int priv, i;
1492 
1493     for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1494       long diff, f_diff, f_weight;
1495 
1496       i = task_faults_idx(nid, priv);
1497 
1498       /* Decay existing window, copy faults since last scan */
1499       diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
1500       fault_types[priv] += p->numa_faults_buffer_memory[i];
1501       p->numa_faults_buffer_memory[i] = 0;
1502 
1503       /*
1504        * Normalize the faults_from, so all tasks in a group
1505        * count according to CPU use, instead of by the raw
1506        * number of faults. Tasks with little runtime have
1507        * little over-all impact on throughput, and thus their
1508        * faults are less important.
1509        */
1510       f_weight = div64_u64(runtime << 16, period + 1);
1511       f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
1512            (total_faults + 1);
1513       f_diff = f_weight - p->numa_faults_cpu[i] / 2;
1514       p->numa_faults_buffer_cpu[i] = 0;
1515 
1516       p->numa_faults_memory[i] += diff;
1517       p->numa_faults_cpu[i] += f_diff;
1518       faults += p->numa_faults_memory[i];
1519       p->total_numa_faults += diff;
1520       if (p->numa_group) {
1521         /* safe because we can only change our own group */
1522         p->numa_group->faults[i] += diff;
1523         p->numa_group->faults_cpu[i] += f_diff;
1524         p->numa_group->total_faults += diff;
1525         group_faults += p->numa_group->faults[i];
1526       }
1527     }
1528 
1529     if (faults > max_faults) {
1530       max_faults = faults;
1531       max_nid = nid;
1532     }
1533 
1534     if (group_faults > max_group_faults) {
1535       max_group_faults = group_faults;
1536       max_group_nid = nid;
1537     }
1538   }
1539 
1540   update_task_scan_period(p, fault_types[0], fault_types[1]);
1541 
1542   if (p->numa_group) {
1543     update_numa_active_node_mask(p->numa_group);
1544     /*
1545      * If the preferred task and group nids are different,
1546      * iterate over the nodes again to find the best place.
1547      */
1548     if (max_nid != max_group_nid) {
1549       unsigned long weight, max_weight = 0;
1550 
1551       for_each_online_node(nid) {
1552         weight = task_weight(p, nid) + group_weight(p, nid);
1553         if (weight > max_weight) {
1554           max_weight = weight;
1555           max_nid = nid;
1556         }
1557       }
1558     }
1559 
1560     spin_unlock(group_lock);
1561   }
1562 
1563   /* Preferred node as the node with the most faults */
1564   if (max_faults && max_nid != p->numa_preferred_nid) {
1565     /* Update the preferred nid and migrate task if possible */
1566     sched_setnuma(p, max_nid);
1567     numa_migrate_preferred(p);
1568   }
1569 }
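
The ‘diff = buffer - faults/2’ update above is a decaying average: every scan period half of the old value is kept and the faults recorded since the last scan are added on top. A toy sketch of how one counter behaves while faults keep arriving and after they stop:

#include <stdio.h>

int main(void)
{
  long numa_faults = 0;                            /* p->numa_faults_memory[i] */
  long per_scan[]  = { 100, 100, 100, 100, 0, 0, 0 };

  for (int seq = 0; seq < 7; seq++) {
    long buffer = per_scan[seq];                   /* faults since the last scan */
    long diff   = buffer - numa_faults / 2;        /* decay old window, add new */

    numa_faults += diff;
    printf("scan %d: numa_faults = %ld\n", seq, numa_faults);
  }
  /* converges toward ~2x the per-scan rate, then halves each scan once the
   * workload stops touching remote memory */
  return 0;
}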

The actual migration happens in the functions below.

1310 /* Attempt to migrate a task to a CPU on the preferred node. */
1311 static void numa_migrate_preferred(struct task_struct *p)
1312 {
1313   /* This task has no NUMA fault statistics yet */
1314   if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1315     return;
1316 
1317   /* Periodically retry migrating the task to the preferred node */
1318   p->numa_migrate_retry = jiffies + HZ;
1319 
1320   /* Success if task is already running on preferred CPU */
1321   if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
1322     return;
1323 
1324   /* Otherwise, try migrate to a CPU on the preferred node */
1325   task_numa_migrate(p);
1326 }


1212 static int task_numa_migrate(struct task_struct *p)
1213 {
1214   struct task_numa_env env = {
1215     .p = p,
1216     
1217     .src_cpu = task_cpu(p),
1218     .src_nid = task_node(p),
1219     
1220     .imbalance_pct = 112,
1221     
1222     .best_task = NULL,
1223     .best_imp = 0,
1224     .best_cpu = -1
1225   };
1226   struct sched_domain *sd;
1227   unsigned long taskweight, groupweight;
1228   int nid, ret; 
1229   long taskimp, groupimp;
1230   
1231   /*
1232    * Pick the lowest SD_NUMA domain, as that would have the smallest
1233    * imbalance and would be the first to start moving tasks about.
1234    *
1235    * And we want to avoid any moving of tasks about, as that would create
1236    * random movement of tasks -- counter the numa conditions we're trying
1237    * to satisfy here.
1238    */
1239   rcu_read_lock();
1240   sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1241   if (sd)
1242     env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1243   rcu_read_unlock();
1244 
1245   /*
1246    * Cpusets can break the scheduler domain tree into smaller
1247    * balance domains, some of which do not cross NUMA boundaries.
1248    * Tasks that are "trapped" in such domains cannot be migrated
1249    * elsewhere, so there is no point in (re)trying.
1250    */
1251   if (unlikely(!sd)) {
1252     p->numa_preferred_nid = cpu_to_node(task_cpu(p));
1253     return -EINVAL;
1254   }
1255 
1256   taskweight = task_weight(p, env.src_nid);
1257   groupweight = group_weight(p, env.src_nid);
1258   update_numa_stats(&env.src_stats, env.src_nid);
1259   env.dst_nid = p->numa_preferred_nid;
1260   taskimp = task_weight(p, env.dst_nid) - taskweight;
1261   groupimp = group_weight(p, env.dst_nid) - groupweight;
1262   update_numa_stats(&env.dst_stats, env.dst_nid);
1263 
1264   /* If the preferred nid has capacity, try to use it. */
1265   if (env.dst_stats.has_capacity)
1266     task_numa_find_cpu(&env, taskimp, groupimp);
1267 
1268   /* No space available on the preferred nid. Look elsewhere. */
1269   if (env.best_cpu == -1) {
1270     for_each_online_node(nid) {
1271       if (nid == env.src_nid || nid == p->numa_preferred_nid)
1272         continue;
1273 
1274       /* Only consider nodes where both task and groups benefit */
1275       taskimp = task_weight(p, nid) - taskweight;
1276       groupimp = group_weight(p, nid) - groupweight;
1277       if (taskimp < 0 && groupimp < 0)
1278         continue;
1279 
1280       env.dst_nid = nid;
1281       update_numa_stats(&env.dst_stats, env.dst_nid);
1282       task_numa_find_cpu(&env, taskimp, groupimp);
1283     }
1284   }
1285 
1286   /* No better CPU than the current one was found. */
1287   if (env.best_cpu == -1)
1288     return -EAGAIN;
1289 
1290   sched_setnuma(p, env.dst_nid);
1291 
1292   /*
1293    * Reset the scan period if the task is being rescheduled on an
1294    * alternative node to recheck if the tasks is now properly placed.
1295    */
1296   p->numa_scan_period = task_scan_min(p);
1297 
1298   if (env.best_task == NULL) {
1299     if ((ret = migrate_task_to(p, env.best_cpu)) != 0)
1300       trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1301     return ret;
1302   }
1303 
1304   if ((ret = migrate_swap(p, env.best_task)) != 0)
1305     trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1306   put_task_struct(env.best_task);
1307   return ret;
1308 }
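
When task_numa_migrate()/migrate_swap() succeeds, the thread simply starts running on a CPU of the preferred node, which is easy to watch from user space. An observation sketch (illustration only; getcpu() reports the CPU and NUMA node the caller is currently on):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
  for (int i = 0; i < 10; i++) {
    unsigned int cpu, node;

    if (syscall(SYS_getcpu, &cpu, &node, NULL) == 0)
      printf("running on cpu %u, node %u\n", cpu, node);
    sleep(1);   /* a task migrated toward its preferred node reports the new node */
  }
  return 0;
}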

The hook into the scheduler is the tick path: task_tick_fair() calls task_tick_numa() whenever NUMA balancing is enabled.

/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
  struct cfs_rq *cfs_rq;
  struct sched_entity *se = &curr->se;

  for_each_sched_entity(se) {
    cfs_rq = cfs_rq_of(se);
    entity_tick(cfs_rq, se, queued);
  }

  if (numabalancing_enabled)
    task_tick_numa(rq, curr);

  update_rq_runnable_avg(rq, 1);
}

The actual check that drives the periodic NUMA hinting faults is task_tick_numa():

/*
 * Drive the periodic memory faults..
 */
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
  struct callback_head *work = &curr->numa_work;
  u64 period, now; 

  /*
   * We don't care about NUMA placement if we don't have memory.
   */
  if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
    return;

  /*
   * Using runtime rather than walltime has the dual advantage that
   * we (mostly) drive the selection from busy threads and that the
   * task needs to have done some actual work before we bother with
   * NUMA placement.
   */
  now = curr->se.sum_exec_runtime;
  period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

  if (now - curr->node_stamp > period) {
    if (!curr->node_stamp)
      curr->numa_scan_period = task_scan_min(curr);
    curr->node_stamp += period;

    if (!time_before(jiffies, curr->mm->numa_next_scan)) {
      init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
      task_work_add(curr, work, true);
    }    
  }
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
#endif /* CONFIG_NUMA_BALANCING */
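
So the scan is paced by CPU time rather than wall time: once the task has accumulated numa_scan_period milliseconds of runtime past node_stamp, the work item is queued (the real code additionally rate-limits per mm through numa_next_scan). A toy model of just that comparison:

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
  unsigned long long node_stamp     = 0;      /* runtime at the last scan (ns) */
  unsigned long long scan_period_ms = 1000;   /* numa_balancing_scan_period_min_ms */
  unsigned long long period         = scan_period_ms * NSEC_PER_MSEC;

  /* pretend the task has run for 1.5 seconds of CPU time since the last scan */
  unsigned long long sum_exec_runtime = 1500 * NSEC_PER_MSEC;

  if (sum_exec_runtime - node_stamp > period) {
    node_stamp += period;
    printf("tick at %llu ms of runtime: task_numa_work would be queued\n",
           sum_exec_runtime / NSEC_PER_MSEC);
  }
  return 0;
}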

It registers task_numa_work() as a task_work callback:

static inline void
init_task_work(struct callback_head *twork, task_work_func_t func)
{
  twork->func = func;
}

int task_work_add(struct task_struct *task, struct callback_head *twork, bool);
struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
void task_work_run(void);

static inline void exit_task_work(struct task_struct *task)
{
  task_work_run();
}

/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */
void task_numa_work(struct callback_head *work)
{
  unsigned long migrate, next_scan, now = jiffies;
  struct task_struct *p = current;
  struct mm_struct *mm = p->mm;
  struct vm_area_struct *vma;
  unsigned long start, end;
  unsigned long nr_pte_updates = 0;
  long pages;

  WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

  work->next = work; /* protect against double add */
  /*
   * Who cares about NUMA placement when they're dying.
   *
   * NOTE: make sure not to dereference p->mm before this check,
   * exit_task_work() happens _after_ exit_mm() so we could be called
   * without p->mm even though we still had it when we enqueued this
   * work.
   */
  if (p->flags & PF_EXITING)
    return;

  if (!mm->numa_next_scan) {
    mm->numa_next_scan = now +
      msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
  }

  /*
   * Enforce maximal scan/migration frequency..
   */
  migrate = mm->numa_next_scan;
  if (time_before(now, migrate))
    return;
  
  if (p->numa_scan_period == 0) {
    p->numa_scan_period_max = task_scan_max(p);
    p->numa_scan_period = task_scan_min(p);
  }
  
  next_scan = now + msecs_to_jiffies(p->numa_scan_period);
  if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
    return;
  
  /*
   * Delay this task enough that another task of this mm will likely win
   * the next time around.
   */
  p->node_stamp += 2 * TICK_NSEC;
  
  start = mm->numa_scan_offset;
  pages = sysctl_numa_balancing_scan_size;
  pages <<= 20 - PAGE_SHIFT; /* MB in pages */
  if (!pages)
    return;

  down_read(&mm->mmap_sem);
  vma = find_vma(mm, start);
  if (!vma) {
    reset_ptenuma_scan(p);
    start = 0;
    vma = mm->mmap;
  }
  for (; vma; vma = vma->vm_next) {
    if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
      continue;
    
    /*
     * Shared library pages mapped by multiple processes are not
     * migrated as it is expected they are cache replicated. Avoid
     * hinting faults in read-only file-backed mappings or the vdso
     * as migrating the pages will be of marginal benefit.
     */
    if (!vma->vm_mm ||
        (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
      continue;

    /*
     * Skip inaccessible VMAs to avoid any confusion between
     * PROT_NONE and NUMA hinting ptes
     */
    if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
      continue;

    do {
      start = max(start, vma->vm_start);
      end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
      end = min(end, vma->vm_end);
      nr_pte_updates += change_prot_numa(vma, start, end);

      /*
       * Scan sysctl_numa_balancing_scan_size but ensure that
       * at least one PTE is updated so that unused virtual
       * address space is quickly skipped.
       */
      if (nr_pte_updates)
        pages -= (end - start) >> PAGE_SHIFT;

      start = end;
      if (pages <= 0)
        goto out;
    } while (end != vma->vm_end);
  }

out:
  /*
   * It is possible to reach the end of the VMA list but the last few
   * VMAs are not guaranteed to the vma_migratable. If they are not, we
   * would find the !migratable VMA on the next scan but not reset the
   * scanner to the start so check it now.
   */
  if (vma)
    mm->numa_scan_offset = start;
  else
    reset_ptenuma_scan(p);
  up_read(&mm->mmap_sem);
}
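
The scanner's activity can be watched through the NUMA counters in /proc/vmstat (numa_pte_updates, numa_hint_faults, numa_hint_faults_local, numa_pages_migrated on kernels built with CONFIG_NUMA_BALANCING). A small reader sketch:

#include <stdio.h>
#include <string.h>

int main(void)
{
  FILE *f = fopen("/proc/vmstat", "r");
  char line[256];

  if (!f) {
    perror("/proc/vmstat");
    return 1;
  }
  while (fgets(line, sizeof(line), f)) {
    if (!strncmp(line, "numa_pte_updates", 16) ||
        !strncmp(line, "numa_hint_faults", 16) ||     /* also matches _local */
        !strncmp(line, "numa_pages_migrated", 19))
      fputs(line, stdout);
  }
  fclose(f);
  return 0;
}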

