linux/kernel/cgroup_pids.c
/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
 * of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
 *
 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License.  See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */
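
/*
 * Illustrative shell usage, assuming the controller is mounted at
 * /sys/fs/cgroup/pids (mount point and cgroup names are hypothetical,
 * not defined by this file):
 *
 *   # echo 42 > /sys/fs/cgroup/pids/parent/pids.max
 *   # cat /sys/fs/cgroup/pids/parent/pids.current
 *   # echo max > /sys/fs/cgroup/pids/parent/pids.max
 */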

#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>

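/*
 * PIDS_MAX is one more than the largest valid pid (PID_MAX_LIMIT), so a
 * limit of PIDS_MAX can never actually be hit and doubles as "no limit".
 */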
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"

struct pids_cgroup {
        struct cgroup_subsys_state      css;

        /*
         * Use 64-bit types so that we can safely represent "max" as
         * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
         */
        atomic64_t                      counter;
        int64_t                         limit;

        /* Handle for "pids.events" */
        struct cgroup_file              events_file;

        /* Number of times fork failed because limit was hit. */
        atomic64_t                      events_limit;
};

static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
        return container_of(css, struct pids_cgroup, css);
}

static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
        return css_pids(pids->css.parent);
}

static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
        struct pids_cgroup *pids;

        pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
        if (!pids)
                return ERR_PTR(-ENOMEM);

        pids->limit = PIDS_MAX;
        atomic64_set(&pids->counter, 0);
        atomic64_set(&pids->events_limit, 0);
        return &pids->css;
}

static void pids_css_free(struct cgroup_subsys_state *css)
{
        kfree(css_pids(css));
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
        /*
         * A negative count (or overflow for that matter) is invalid,
         * and indicates a bug in the `pids` controller proper.
         */
        WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p;

        for (p = pids; parent_pids(p); p = parent_pids(p))
                pids_cancel(p, num);
}

/**
 * pids_charge - hierarchically charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function does *not* follow the pid limit set. It cannot fail and the new
 * pid count may exceed the limit. This is used when attaching tasks and when
 * reverting failed attaches, since organisational operations must not fail and
 * there is no other way out than violating the limit.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p;

        for (p = pids; parent_pids(p); p = parent_pids(p))
                atomic64_add(num, &p->counter);
}

/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
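 *
 * Illustrative walk (cgroup names are hypothetical): for a task in a/b,
 * charging walks b, then a. If a's new count would exceed a's limit, the
 * charge already applied to b is cancelled and -EAGAIN is returned.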
 */
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p, *q;

        for (p = pids; parent_pids(p); p = parent_pids(p)) {
                int64_t new = atomic64_add_return(num, &p->counter);

                /*
                 * Since new is capped to the maximum number of pid_t, if
                 * p->limit is %PIDS_MAX then we know that this test will never
                 * fail.
                 */
                if (new > p->limit)
                        goto revert;
        }

        return 0;

revert:
        for (q = pids; q != p; q = parent_pids(q))
                pids_cancel(q, num);
        pids_cancel(p, num);

        return -EAGAIN;
}

static int pids_can_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *dst_css;

        cgroup_taskset_for_each(task, dst_css, tset) {
                struct pids_cgroup *pids = css_pids(dst_css);
                struct cgroup_subsys_state *old_css;
                struct pids_cgroup *old_pids;

                /*
                 * No need to pin @old_css between here and cancel_attach()
                 * because cgroup core protects it from being freed before
                 * the migration completes or fails.
                 */
                old_css = task_css(task, pids_cgrp_id);
                old_pids = css_pids(old_css);

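                /*
                 * Attaching is an organisational operation, so use the
                 * non-failing charge; pids.current may temporarily exceed
                 * pids.max here.
                 */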
                pids_charge(pids, 1);
                pids_uncharge(old_pids, 1);
        }

        return 0;
}

static void pids_cancel_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *dst_css;

        cgroup_taskset_for_each(task, dst_css, tset) {
                struct pids_cgroup *pids = css_pids(dst_css);
                struct cgroup_subsys_state *old_css;
                struct pids_cgroup *old_pids;

                old_css = task_css(task, pids_cgrp_id);
                old_pids = css_pids(old_css);

                pids_charge(old_pids, 1);
                pids_uncharge(pids, 1);
        }
}

/*
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on threadgroup_change_begin() held by copy_process().
 */
static int pids_can_fork(struct task_struct *task)
{
        struct cgroup_subsys_state *css;
        struct pids_cgroup *pids;
        int err;

        css = task_css_check(current, pids_cgrp_id, true);
        pids = css_pids(css);
        err = pids_try_charge(pids, 1);
        if (err) {
                /* Only log the first time events_limit is incremented. */
                if (atomic64_inc_return(&pids->events_limit) == 1) {
                        pr_info("cgroup: fork rejected by pids controller in ");
                        pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
                        pr_cont("\n");
                }
                cgroup_file_notify(&pids->events_file);
        }
        return err;
}

static void pids_cancel_fork(struct task_struct *task)
{
        struct cgroup_subsys_state *css;
        struct pids_cgroup *pids;

        css = task_css_check(current, pids_cgrp_id, true);
        pids = css_pids(css);
        pids_uncharge(pids, 1);
}

static void pids_free(struct task_struct *task)
{
        struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));

        pids_uncharge(pids, 1);
}

static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
                              size_t nbytes, loff_t off)
{
        struct cgroup_subsys_state *css = of_css(of);
        struct pids_cgroup *pids = css_pids(css);
        int64_t limit;
        int err;

        buf = strstrip(buf);
        if (!strcmp(buf, PIDS_MAX_STR)) {
                limit = PIDS_MAX;
                goto set_limit;
        }

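        /* Base 0: kstrtoll() accepts decimal, 0x-prefixed hex and 0-prefixed octal. */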
        err = kstrtoll(buf, 0, &limit);
        if (err)
                return err;

        if (limit < 0 || limit >= PIDS_MAX)
                return -EINVAL;

set_limit:
        /*
         * Limit updates don't need to be mutex'd, since it isn't
         * critical that any racing fork()s follow the new limit.
         */
        pids->limit = limit;
        return nbytes;
}

static int pids_max_show(struct seq_file *sf, void *v)
{
        struct cgroup_subsys_state *css = seq_css(sf);
        struct pids_cgroup *pids = css_pids(css);
        int64_t limit = pids->limit;

        if (limit >= PIDS_MAX)
                seq_printf(sf, "%s\n", PIDS_MAX_STR);
        else
                seq_printf(sf, "%lld\n", limit);

        return 0;
}

static s64 pids_current_read(struct cgroup_subsys_state *css,
                             struct cftype *cft)
{
        struct pids_cgroup *pids = css_pids(css);

        return atomic64_read(&pids->counter);
}

static int pids_events_show(struct seq_file *sf, void *v)
{
        struct pids_cgroup *pids = css_pids(seq_css(sf));

        seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
        return 0;
}

static struct cftype pids_files[] = {
        {
                .name = "max",
                .write = pids_max_write,
                .seq_show = pids_max_show,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "current",
                .read_s64 = pids_current_read,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "events",
                .seq_show = pids_events_show,
                .file_offset = offsetof(struct pids_cgroup, events_file),
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        { }     /* terminate */
};

struct cgroup_subsys pids_cgrp_subsys = {
        .css_alloc      = pids_css_alloc,
        .css_free       = pids_css_free,
        .can_attach     = pids_can_attach,
        .cancel_attach  = pids_cancel_attach,
        .can_fork       = pids_can_fork,
        .cancel_fork    = pids_cancel_fork,
        .free           = pids_free,
        .legacy_cftypes = pids_files,
        .dfl_cftypes    = pids_files,
};