赞
踩
chrt设置RT不成功
- [root@test1:/root] chrt -f 99 whoami
- chrt: failed to set pid 0's policy: Operation not permitted
上述测试语句是将whoami这个命令进程的调度策略调整成fifo 99优先级执行,但是报错了。这个错误直译过来是:给pid 0(即当前进程自身)设置调度策略失败,操作不被允许,没有这个权限。
先搜一下有没有类似问题:
Ubuntu – chrt(): “failed to set pid XXX’s policy” on one machine, but not others – iTecTec
[原创] chrt: failed to set pid xxxx's policy: Operation not permitted_Dream.Seeker的博客-CSDN博客
解决方式是:sysctl -w kernel.sched_rt_runtime_us=-1
这条命令意思是将rt任务的执行时间设置为不受限制,即只要有rt任务,那么这个任务会一直占据CPU直到运行结束或主动让权,这是个高危设置,容易造成hungtask或softlockup,其他任务得不到调度或直接系统卡死。
为什么这样设置后就可以让chrt 执行成功了呢?以及之前执行失败的真正原因是什么呢?搜索知识库已经没能找到答案了,正面分析一下。
strace一下系统调用:
- strace chrt -f 99 whoami
- execve("/usr/bin/chrt", ["chrt", "-f", "99", "whoami"], 0x7fff36a99428 /* 29 vars */) = 0
- brk(NULL) = 0x608000
- ......
- sched_get_priority_min(SCHED_FIFO) = 1
- sched_get_priority_max(SCHED_FIFO) = 99
- sched_setattr(0, {size=48, sched_policy=SCHED_FIFO, sched_flags=0, sched_nice=0, sched_priority=99, sched_runtime=0, sched_deadline=0, sched_period=0}, 0) = -1 EPERM (Operation not permitted)
- write(2, "chrt: ", 6chrt: ) = 6
- write(2, "failed to set pid 0's policy", 28failed to set pid 0's policy) = 28
- write(2, ": ", 2: ) = 2
- ......
- +++ exited with 1 +++
发现是sched_setattr返回了EPERM错误(操作无权限),继续走读内核代码,函数调用关系:sched_setattr -> __sched_setscheduler
对应函数代码:
- static int __sched_setscheduler(struct task_struct *p,
- const struct sched_attr *attr,
- bool user, bool pi)
- {
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
- const struct sched_class *prev_class;
- struct rq_flags rf;
- int reset_on_fork;
- int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct rq *rq;
-
- /* The pi code expects interrupts enabled */
- BUG_ON(pi && in_interrupt());
- recheck:
- /* Double check policy once rq lock held: */
- if (policy < 0) {
- reset_on_fork = p->sched_reset_on_fork;
- policy = oldpolicy = p->policy;
- } else {
- reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
-
- if (!valid_policy(policy))
- return -EINVAL;
- }
-
- if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
- return -EINVAL;
-
- /*
- * Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
- * SCHED_BATCH and SCHED_IDLE is 0.
- */
- if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
- return -EINVAL;
- if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
- (rt_policy(policy) != (attr->sched_priority != 0)))
- return -EINVAL;
-
- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (idle_policy(p->policy) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
- if (user) {
- if (attr->sched_flags & SCHED_FLAG_SUGOV)
- return -EINVAL;
-
- retval = security_task_setscheduler(p);
- if (retval)
- return retval;
- }
-
- /*
- * Make sure no PI-waiters arrive (or leave) while we are
- * changing the priority of the task:
- *
- * To be able to change p->policy safely, the appropriate
- * runqueue lock must be held.
- */
- rq = task_rq_lock(p, &rf);
- update_rq_clock(rq);
-
- /*
- * Changing the policy of the stop threads its a very bad idea:
- */
- if (p == rq->stop) {
- task_rq_unlock(rq, p, &rf);
- return -EINVAL;
- }
-
- /*
- * If not changing anything there's no need to proceed further,
- * but store a possible modification of reset_on_fork.
- */
- if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
- goto change;
- if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
- goto change;
- if (dl_policy(policy) && dl_param_changed(p, attr))
- goto change;
-
- p->sched_reset_on_fork = reset_on_fork;
- task_rq_unlock(rq, p, &rf);
- return 0;
- }
- change:
-
- if (user) {
- #ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Do not allow realtime tasks into groups that have no runtime
- * assigned.
- */
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0 &&
- !task_group_is_autogroup(task_group(p))) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
- }
- #endif
- #ifdef CONFIG_SMP
- if (dl_bandwidth_enabled() && dl_policy(policy) &&
- !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
- cpumask_t *span = rq->rd->span;
-
- /*
- * Don't allow tasks with an affinity mask smaller than
- * the entire root_domain to become SCHED_DEADLINE. We
- * will also fail if there's no bandwidth available.
- */
- if (!cpumask_subset(span, &p->cpus_allowed) ||
- rq->rd->dl_bw.bw == 0) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
- }
- }
- #endif
- }
-
- /* Re-check policy now with rq lock held: */
- if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
- policy = oldpolicy = -1;
- task_rq_unlock(rq, p, &rf);
- goto recheck;
- }
-
- /*
- * If setscheduling to SCHED_DEADLINE (or changing the parameters
- * of a SCHED_DEADLINE task) we need to check if enough bandwidth
- * is available.
- */
- if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- task_rq_unlock(rq, p, &rf);
- return -EBUSY;
- }
-
- p->sched_reset_on_fork = reset_on_fork;
- oldprio = p->prio;
-
- if (pi) {
- /*
- * Take priority boosted tasks into account. If the new
- * effective priority is unchanged, we just store the new
- * normal parameters and do not touch the scheduler class and
- * the runqueue. This will be done when the task deboost
- * itself.
- */
- new_effective_prio = rt_effective_prio(p, newprio);
- if (new_effective_prio == oldprio)
- queue_flags &= ~DEQUEUE_MOVE;
- }
-
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
-
- prev_class = p->sched_class;
- __setscheduler(rq, p, attr, pi);
-
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- if (oldprio < p->prio)
- queue_flags |= ENQUEUE_HEAD;
-
- enqueue_task(rq, p, queue_flags);
- }
- if (running)
- set_curr_task(rq, p);
-
- check_class_changed(rq, p, prev_class, oldprio);
-
- /* Avoid rq from going away on us: */
- preempt_disable();
- task_rq_unlock(rq, p, &rf);
-
- if (pi)
- rt_mutex_adjust_pi(p);
-
- /* Run balance callbacks after we've adjusted the PI chain: */
- balance_callback(rq);
- preempt_enable();
-
- return 0;
- }

直接分析EPERM返回点,发现一处跟rt_runtime有关的判断分支:
- /*
- * Do not allow realtime tasks into groups that have no runtime
- * assigned.
- */
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0 &&
- !task_group_is_autogroup(task_group(p))) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
- }
- static inline int rt_bandwidth_enabled(void)
- {
- return sysctl_sched_rt_runtime >= 0;
- }
- {
- .procname = "sched_rt_runtime_us",
- .data = &sysctl_sched_rt_runtime,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sched_rt_handler,
- }
这个判断的直接意思就是:如果sched_rt_runtime_us大于或等于0(即使能了实时进程的运行时间限制),要设置的是实时调度策略(SCHED_FIFO/SCHED_RR),并且进程所在的任务组不是autogroup、该任务组分配到的实时运行时间rt_runtime为0,那么 sched_setattr 设置 rt 策略的操作就没有权限,返回EPERM。
sysctl -w kernel.sched_rt_runtime_us=-1 是将/proc/sys/kernel/sched_rt_runtime_us设置为-1,即sysctl_sched_rt_runtime=-1,这个操作过后rt_bandwidth_enabled()返回假,上述条件不满足了,实际上就是给了设置rt进程操作的权限了,同时也带来了系统隐患。
真正的解决方案应该在这里:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p)))
这个意思是进程所在任务组的实时运行时间限制为0,只要把它调整为大于0就可以了,系统全局的默认数值(sched_rt_runtime_us)是950000微秒,即0.95s。这数值怎么调呢?
先看下新进程所在cgroup的rt_runtime_us配置,果然是0:
- [root@test1:/root]
- cat /sys/fs/cgroup/cpu/system.slice/cpu.rt_runtime_us
- 0
解决步骤
1、找到当前环境下新进程的cgroup。cat /proc/(进程pid)/cgroup
- [root@test1:/root]
- cat /proc/35216/cgroup
- 12:devices:/system.slice
- 11:cpuset:/
- 10:perf_event:/
- 9:freezer:/
- 8:memory:/system.slice
- 7:pids:/
- 6:blkio:/
- 5:cpu,cpuacct:/system.slice
- 4:net_cls,net_prio:/
- 3:rdma:/
- 2:hugetlb:/
- 1:name=systemd:/system.slice
2、进入到对应的cgroup修改cpu.rt_runtime_us为950000(或其他大于0的值,取决于你的rt策略)。echo 950000 > /sys/fs/cgroup/cpu/(对应进程的cgroup)/cpu.rt_runtime_us
- [root@test1:/root]
- echo 950000 > /sys/fs/cgroup/cpu/system.slice/cpu.rt_runtime_us
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。