Automatic NUMA Balancing
The feature is described in Documentation/sysctl/kernel.txt:
numa_balancing
Enables/disables automatic page fault based NUMA memory
balancing. Memory is moved automatically to nodes
that access it often.

Enables/disables automatic NUMA memory balancing. On NUMA machines, there
is a performance penalty if remote memory is accessed by a CPU. When this
feature is enabled the kernel samples what task thread is accessing memory
is a performance penalty if remote memory is accessed by a CPU. When this
feature is enabled the kernel samples what task thread is accessing memory
by periodically unmapping pages and later trapping a page fault. At the
time of the page fault, it is determined if the data being accessed should
be migrated to a local memory node.

The unmapping of pages and trapping faults incur additional overhead that
ideally is offset by improved memory locality but there is no universal
guarantee. If the target workload is already bound to NUMA nodes then this
feature should be disabled. Otherwise, if the system overhead from the
feature is too high then the rate the kernel samples for NUMA hinting
faults may be controlled by the numa_balancing_scan_period_min_ms,
numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
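These sysctls show up under /proc/sys/kernel/ (with the "kernel." prefix dropped). As a quick reference, here is a minimal user-space sketch of mine, not from the kernel sources, that prints the current settings and optionally toggles the feature; it assumes a kernel built with CONFIG_NUMA_BALANCING and root privileges for the write.

/* Minimal sketch: inspect (and optionally toggle) the NUMA balancing
 * sysctls through /proc/sys. Assumes CONFIG_NUMA_BALANCING; run as
 * root if you want the write at the end to succeed. */
#include <stdio.h>

static void show(const char *name)
{
	char path[256], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "r");
	if (!f) {
		printf("%s: not available\n", name);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", name, buf);
	fclose(f);
}

int main(int argc, char **argv)
{
	show("numa_balancing");
	show("numa_balancing_scan_delay_ms");
	show("numa_balancing_scan_period_min_ms");
	show("numa_balancing_scan_period_max_ms");
	show("numa_balancing_scan_size_mb");

	/* "./a.out 0" disables the feature, "./a.out 1" re-enables it */
	if (argc > 1) {
		FILE *f = fopen("/proc/sys/kernel/numa_balancing", "w");

		if (!f || fputs(argv[1], f) == EOF)
			perror("write numa_balancing");
		if (f)
			fclose(f);
	}
	return 0;
}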
The sysctl entry is defined in kernel/sysctl.c:
{
	.procname	= "numa_balancing",
	.data		= NULL, /* filled in by handler */
	.maxlen		= sizeof(unsigned int),
	.mode		= 0644,
	.proc_handler	= sysctl_numa_balancing,
	.extra1		= &zero,
	.extra2		= &one,
},
There are also related sysctl parameters:
kernel.numa_balancing_scan_delay_ms = 1000
kernel.numa_balancing_scan_period_max_ms = 60000
kernel.numa_balancing_scan_period_min_ms = 1000
kernel.numa_balancing_scan_size_mb = 256
kernel.numa_balancing_settle_count = 4
The sysctl proc handler is defined in kernel/sched/core.c:
#ifdef CONFIG_NUMA_BALANCING
#ifdef CONFIG_SCHED_DEBUG
void set_numabalancing_state(bool enabled)
{
	if (enabled)
		sched_feat_set("NUMA");
	else
		sched_feat_set("NO_NUMA");
}
#else
__read_mostly bool numabalancing_enabled;

void set_numabalancing_state(bool enabled)
{
	numabalancing_enabled = enabled;
}
#endif /* CONFIG_SCHED_DEBUG */

#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
			 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int err;
	int state = numabalancing_enabled;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	t = *table;
	t.data = &state;
	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (err < 0)
		return err;
	if (write)
		set_numabalancing_state(state);
	return err;
}
#endif
#endif
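Because the handler runs the value through proc_dointvec_minmax() with .extra1 = &zero and .extra2 = &one, only 0 and 1 are accepted; anything else should be rejected with EINVAL. A small sketch of mine (not from the article) to demonstrate that from user space, assuming the sysctl is present and you run as root:

/* Sketch: proc_dointvec_minmax() bounded by &zero/&one should reject
 * out-of-range writes with EINVAL. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/numa_balancing", "w");

	if (!f) {
		perror("open");
		return 1;
	}
	errno = 0;
	if (fputs("2", f) == EOF || fflush(f) == EOF)
		printf("write of 2 rejected: %s\n", strerror(errno)); /* expect EINVAL */
	else
		printf("unexpected: write of 2 accepted\n");
	fclose(f);
	return 0;
}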
There are two locations that use ‘numabalancing_enabled’. The first is task_numa_fault():
/*
 * Got a PROT_NONE fault for a page on @node.
 */
void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
{
	struct task_struct *p = current;
	bool migrated = flags & TNF_MIGRATED;
	int cpu_node = task_node(current);
	int priv;

	if (!numabalancing_enabled)
		return;

	/* for example, ksmd faulting in a user's mm */
	if (!p->mm)
		return;

	/* Do not worry about placement if exiting */
	if (p->state == TASK_DEAD)
		return;

	/* Allocate buffer to track faults on a per-node basis */
	if (unlikely(!p->numa_faults_memory)) {
		int size = sizeof(*p->numa_faults_memory) *
			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

		p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
		if (!p->numa_faults_memory)
			return;

		BUG_ON(p->numa_faults_buffer_memory);
		/*
		 * The averaged statistics, shared & private, memory & cpu,
		 * occupy the first half of the array. The second half of the
		 * array is for current counters, which are averaged into the
		 * first set by task_numa_placement.
		 */
		p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
		p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
		p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
		p->total_numa_faults = 0;
		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
	}

	/*
	 * First accesses are treated as private, otherwise consider accesses
	 * to be private if the accessing pid has not changed
	 */
	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
		priv = 1;
	} else {
		priv = cpupid_match_pid(p, last_cpupid);
		if (!priv && !(flags & TNF_NO_GROUP))
			task_numa_group(p, last_cpupid, flags, &priv);
	}

	task_numa_placement(p);

	/*
	 * Retry task to preferred node migration periodically, in case it
	 * case it previously failed, or the scheduler moved us.
	 */
	if (time_after(jiffies, p->numa_migrate_retry))
		numa_migrate_preferred(p);

	if (migrated)
		p->numa_pages_migrated += pages;

	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
}
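The per-task counters updated here are not directly visible from user space, but global counters for the same events (numa_pte_updates, numa_hint_faults, numa_hint_faults_local, numa_pages_migrated) are exported in /proc/vmstat on kernels built with CONFIG_NUMA_BALANCING. A quick sketch of mine to watch them:

/* Sketch: dump the global NUMA balancing counters from /proc/vmstat.
 * The numa_hint_* and numa_pages_migrated lines only appear when the
 * kernel is built with CONFIG_NUMA_BALANCING. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("open /proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* print every numa_* counter, e.g. numa_hint_faults */
		if (!strncmp(line, "numa_", 5))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}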
It calls ‘task_numa_placement()’, which updates the task’s preferred node, and ‘numa_migrate_preferred()’, which attempts the actual migration.
static void task_numa_placement(struct task_struct *p)
{
	int seq, nid, max_nid = -1, max_group_nid = -1;
	unsigned long max_faults = 0, max_group_faults = 0;
	unsigned long fault_types[2] = { 0, 0 };
	unsigned long total_faults;
	u64 runtime, period;
	spinlock_t *group_lock = NULL;

	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;
	p->numa_scan_period_max = task_scan_max(p);

	total_faults = p->numa_faults_locality[0] +
		       p->numa_faults_locality[1];
	runtime = numa_get_avg_runtime(p, &period);

	/* If the task is part of a group prevent parallel updates to group stats */
	if (p->numa_group) {
		group_lock = &p->numa_group->lock;
		spin_lock(group_lock);
	}

	/* Find the node with the highest number of faults */
	for_each_online_node(nid) {
		unsigned long faults = 0, group_faults = 0;
		int priv, i;

		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
			long diff, f_diff, f_weight;

			i = task_faults_idx(nid, priv);

			/* Decay existing window, copy faults since last scan */
			diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
			fault_types[priv] += p->numa_faults_buffer_memory[i];
			p->numa_faults_buffer_memory[i] = 0;

			/*
			 * Normalize the faults_from, so all tasks in a group
			 * count according to CPU use, instead of by the raw
			 * number of faults. Tasks with little runtime have
			 * little over-all impact on throughput, and thus their
			 * faults are less important.
			 */
			f_weight = div64_u64(runtime << 16, period + 1);
			f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
				   (total_faults + 1);
			f_diff = f_weight - p->numa_faults_cpu[i] / 2;
			p->numa_faults_buffer_cpu[i] = 0;

			p->numa_faults_memory[i] += diff;
			p->numa_faults_cpu[i] += f_diff;
			faults += p->numa_faults_memory[i];
			p->total_numa_faults += diff;
			if (p->numa_group) {
				/* safe because we can only change our own group */
				p->numa_group->faults[i] += diff;
				p->numa_group->faults_cpu[i] += f_diff;
				p->numa_group->total_faults += diff;
				group_faults += p->numa_group->faults[i];
			}
		}

		if (faults > max_faults) {
			max_faults = faults;
			max_nid = nid;
		}

		if (group_faults > max_group_faults) {
			max_group_faults = group_faults;
			max_group_nid = nid;
		}
	}

	update_task_scan_period(p, fault_types[0], fault_types[1]);

	if (p->numa_group) {
		update_numa_active_node_mask(p->numa_group);
		/*
		 * If the preferred task and group nids are different,
		 * iterate over the nodes again to find the best place.
		 */
		if (max_nid != max_group_nid) {
			unsigned long weight, max_weight = 0;

			for_each_online_node(nid) {
				weight = task_weight(p, nid) + group_weight(p, nid);
				if (weight > max_weight) {
					max_weight = weight;
					max_nid = nid;
				}
			}
		}

		spin_unlock(group_lock);
	}

	/* Preferred node as the node with the most faults */
	if (max_faults && max_nid != p->numa_preferred_nid) {
		/* Update the preferred nid and migrate task if possible */
		sched_setnuma(p, max_nid);
		numa_migrate_preferred(p);
	}
}
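The core of the placement decision is a per-node exponential decay: on every scan window each counter keeps half of its old value and adds the faults recorded since the last scan, and the node with the highest decayed total becomes the preferred node. Below is a stand-alone model of just that arithmetic, my own simplification that ignores numa_group and the CPU-weighted faults_cpu side.

/* Stand-alone model of the decay in task_numa_placement():
 * faults[nid] += new_faults[nid] - faults[nid] / 2,
 * and the node with the largest decayed total is preferred.
 * Simplification: numa_group and faults_cpu are ignored. */
#include <stdio.h>

#define NR_NODES 4

static int pick_preferred(long faults[NR_NODES], const long new_faults[NR_NODES])
{
	long max_faults = 0;
	int nid, max_nid = -1;

	for (nid = 0; nid < NR_NODES; nid++) {
		/* decay the old window by half, fold in the new one */
		faults[nid] += new_faults[nid] - faults[nid] / 2;
		if (faults[nid] > max_faults) {
			max_faults = faults[nid];
			max_nid = nid;
		}
	}
	return max_nid;
}

int main(void)
{
	long faults[NR_NODES] = { 800, 100, 0, 0 };
	/* recent scans show most accesses have moved to node 1 */
	const long scans[][NR_NODES] = {
		{ 50, 600, 0, 0 },
		{ 20, 700, 0, 0 },
		{ 10, 650, 0, 0 },
	};
	unsigned int i;

	for (i = 0; i < sizeof(scans) / sizeof(scans[0]); i++)
		printf("scan %u: preferred node = %d\n",
		       i, pick_preferred(faults, scans[i]));
	return 0;
}

After a couple of scans the decayed totals for node 1 overtake the stale history on node 0, which is exactly the point of halving the old window: the preferred node tracks where the faults are happening now, not where they happened long ago.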
The actual migration happens in the functions below.
/* Attempt to migrate a task to a CPU on the preferred node. */
static void numa_migrate_preferred(struct task_struct *p)
{
	/* This task has no NUMA fault statistics yet */
	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
		return;

	/* Periodically retry migrating the task to the preferred node */
	p->numa_migrate_retry = jiffies + HZ;

	/* Success if task is already running on preferred CPU */
	if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
		return;

	/* Otherwise, try migrate to a CPU on the preferred node */
	task_numa_migrate(p);
}

static int task_numa_migrate(struct task_struct *p)
{
	struct task_numa_env env = {
		.p = p,

		.src_cpu = task_cpu(p),
		.src_nid = task_node(p),

		.imbalance_pct = 112,

		.best_task = NULL,
		.best_imp = 0,
		.best_cpu = -1
	};
	struct sched_domain *sd;
	unsigned long taskweight, groupweight;
	int nid, ret;
	long taskimp, groupimp;

	/*
	 * Pick the lowest SD_NUMA domain, as that would have the smallest
	 * imbalance and would be the first to start moving tasks about.
	 *
	 * And we want to avoid any moving of tasks about, as that would create
	 * random movement of tasks -- counter the numa conditions we're trying
	 * to satisfy here.
	 */
	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
	if (sd)
		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
	rcu_read_unlock();

	/*
	 * Cpusets can break the scheduler domain tree into smaller
	 * balance domains, some of which do not cross NUMA boundaries.
	 * Tasks that are "trapped" in such domains cannot be migrated
	 * elsewhere, so there is no point in (re)trying.
	 */
	if (unlikely(!sd)) {
		p->numa_preferred_nid = cpu_to_node(task_cpu(p));
		return -EINVAL;
	}

	taskweight = task_weight(p, env.src_nid);
	groupweight = group_weight(p, env.src_nid);
	update_numa_stats(&env.src_stats, env.src_nid);
	env.dst_nid = p->numa_preferred_nid;
	taskimp = task_weight(p, env.dst_nid) - taskweight;
	groupimp = group_weight(p, env.dst_nid) - groupweight;
	update_numa_stats(&env.dst_stats, env.dst_nid);

	/* If the preferred nid has capacity, try to use it. */
	if (env.dst_stats.has_capacity)
		task_numa_find_cpu(&env, taskimp, groupimp);

	/* No space available on the preferred nid. Look elsewhere. */
	if (env.best_cpu == -1) {
		for_each_online_node(nid) {
			if (nid == env.src_nid || nid == p->numa_preferred_nid)
				continue;

			/* Only consider nodes where both task and groups benefit */
			taskimp = task_weight(p, nid) - taskweight;
			groupimp = group_weight(p, nid) - groupweight;
			if (taskimp < 0 && groupimp < 0)
				continue;

			env.dst_nid = nid;
			update_numa_stats(&env.dst_stats, env.dst_nid);
			task_numa_find_cpu(&env, taskimp, groupimp);
		}
	}

	/* No better CPU than the current one was found. */
	if (env.best_cpu == -1)
		return -EAGAIN;

	sched_setnuma(p, env.dst_nid);

	/*
	 * Reset the scan period if the task is being rescheduled on an
	 * alternative node to recheck if the tasks is now properly placed.
	 */
	p->numa_scan_period = task_scan_min(p);

	if (env.best_task == NULL) {
		if ((ret = migrate_task_to(p, env.best_cpu)) != 0)
			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
		return ret;
	}

	if ((ret = migrate_swap(p, env.best_task)) != 0)
		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
	put_task_struct(env.best_task);
	return ret;
}
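As far as I can tell, task_weight() expresses how large a share of the task's recorded faults fall on a given node (scaled to 1000 in this kernel version), so taskimp/groupimp are simply "how much better the candidate node is than where the task sits now". Here is a toy model of mine of that improvement score only; the load and capacity checks that task_numa_find_cpu() performs on top of it are ignored.

/* Toy model of the taskimp computation in task_numa_migrate():
 * weight(nid) = 1000 * faults_on_nid / total_faults (assumed scaling),
 * improvement of moving from src to a candidate = weight(nid) - weight(src).
 * Load/capacity checks done by task_numa_find_cpu() are not modelled. */
#include <stdio.h>

#define NR_NODES 4

static long node_weight(const long faults[NR_NODES], int nid)
{
	long total = 0;
	int i;

	for (i = 0; i < NR_NODES; i++)
		total += faults[i];
	return total ? 1000 * faults[nid] / total : 0;
}

int main(void)
{
	const long faults[NR_NODES] = { 120, 900, 40, 0 };
	int src_nid = 0, nid;
	long src_weight = node_weight(faults, src_nid);

	for (nid = 0; nid < NR_NODES; nid++) {
		if (nid == src_nid)
			continue;
		/* negative taskimp means the move would make locality worse */
		printf("node %d: taskimp = %ld\n",
		       nid, node_weight(faults, nid) - src_weight);
	}
	return 0;
}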
The call below happens on every scheduler tick, from the fair class's task_tick callback task_tick_fair():
/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}

	if (numabalancing_enabled)
		task_tick_numa(rq, curr);

	update_rq_runnable_avg(rq, 1);
}
The actual call that drives the periodic NUMA hinting faults is task_tick_numa():
/*
 * Drive the periodic memory faults..
 */
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	now = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

	if (now - curr->node_stamp > period) {
		if (!curr->node_stamp)
			curr->numa_scan_period = task_scan_min(curr);
		curr->node_stamp += period;

		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
			task_work_add(curr, work, true);
		}
	}
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
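Note that the throttle is driven by CPU time consumed, not wall-clock time: the work is only queued after the task has accumulated another numa_scan_period milliseconds of runtime past node_stamp. A small model of mine of just that check:

/* Model of the runtime-based throttle in task_tick_numa(): queue the
 * scan work only when sum_exec_runtime has advanced one full
 * numa_scan_period (in ns) past node_stamp. */
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

struct task_model {
	unsigned long long sum_exec_runtime;	/* ns of CPU time used */
	unsigned long long node_stamp;		/* runtime at last scan */
	unsigned int numa_scan_period;		/* ms */
};

static int tick(struct task_model *t)
{
	unsigned long long period =
		(unsigned long long)t->numa_scan_period * NSEC_PER_MSEC;

	if (t->sum_exec_runtime - t->node_stamp > period) {
		t->node_stamp += period;
		return 1;	/* would call task_work_add(task_numa_work) */
	}
	return 0;
}

int main(void)
{
	struct task_model t = { 0, 0, 1000 };	/* 1000 ms scan period */
	int i;

	/* simulate ticks where the task burns 250 ms of CPU time each */
	for (i = 1; i <= 10; i++) {
		t.sum_exec_runtime += 250 * NSEC_PER_MSEC;
		printf("tick %2d: %s\n", i, tick(&t) ? "queue scan work" : "skip");
	}
	return 0;
}

An idle or mostly sleeping task never accumulates the runtime, so it never triggers a scan, which is exactly the behaviour the kernel comment describes.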
It registers task_numa_work() as a callback through task_work_add(); the work then runs in the task's own context on its way back to user space.
static inline void
init_task_work(struct callback_head *twork, task_work_func_t func)
{
	twork->func = func;
}

int task_work_add(struct task_struct *task, struct callback_head *twork, bool);
struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
void task_work_run(void);

static inline void exit_task_work(struct task_struct *task)
{
	task_work_run();
}

/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */
void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	struct vm_area_struct *vma;
	unsigned long start, end;
	unsigned long nr_pte_updates = 0;
	long pages;

	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

	work->next = work; /* protect against double add */
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	if (!mm->numa_next_scan) {
		mm->numa_next_scan = now +
			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
	}

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0) {
		p->numa_scan_period_max = task_scan_max(p);
		p->numa_scan_period = task_scan_min(p);
	}

	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Delay this task enough that another task of this mm will likely win
	 * the next time around.
	 */
	p->node_stamp += 2 * TICK_NSEC;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	if (!pages)
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (!vma) {
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
			continue;

		/*
		 * Shared library pages mapped by multiple processes are not
		 * migrated as it is expected they are cache replicated. Avoid
		 * hinting faults in read-only file-backed mappings or the vdso
		 * as migrating the pages will be of marginal benefit.
		 */
		if (!vma->vm_mm ||
		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
			continue;

		/*
		 * Skip inaccessible VMAs to avoid any confusion between
		 * PROT_NONE and NUMA hinting ptes
		 */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			nr_pte_updates += change_prot_numa(vma, start, end);

			/*
			 * Scan sysctl_numa_balancing_scan_size but ensure that
			 * at least one PTE is updated so that unused virtual
			 * address space is quickly skipped.
			 */
			if (nr_pte_updates)
				pages -= (end - start) >> PAGE_SHIFT;

			start = end;
			if (pages <= 0)
				goto out;

			cond_resched();
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	up_read(&mm->mmap_sem);
}
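Putting the numbers together: each invocation of task_numa_work() covers at most numa_balancing_scan_size_mb of virtual address space, and an invocation is queued roughly every numa_scan_period milliseconds of task runtime, so a full pass over a large mapping takes about size / scan_size periods. A tiny back-of-the-envelope sketch of mine, assuming 4 KiB pages:

/* Back-of-the-envelope: how long one full scan pass over a mapping takes,
 * given the per-invocation window (scan_size_mb) and the scan period.
 * Mirrors "pages = scan_size; pages <<= 20 - PAGE_SHIFT" from above. */
#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages */

int main(void)
{
	unsigned long scan_size_mb = 256;	/* numa_balancing_scan_size_mb */
	unsigned long scan_period_ms = 1000;	/* current numa_scan_period */
	unsigned long mapping_mb = 16 * 1024;	/* e.g. a 16 GiB mapping */

	unsigned long pages_per_pass = scan_size_mb << (20 - PAGE_SHIFT);
	unsigned long passes = (mapping_mb + scan_size_mb - 1) / scan_size_mb;

	printf("pages unmapped per pass : %lu\n", pages_per_pass);
	printf("passes for full mapping : %lu\n", passes);
	printf("time for one full scan  : ~%lu ms of task runtime\n",
	       passes * scan_period_ms);
	return 0;
}

With the defaults above, a 16 GiB mapping needs 64 passes, i.e. roughly a minute of the task's own CPU time per complete scan, which is why raising numa_balancing_scan_period_min_ms or lowering numa_balancing_scan_size_mb reduces the overhead at the cost of slower placement convergence.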