
chrt: failed to set pid 0's policy: Operation not permitted

chrt cannot change the scheduling policy

1. Symptom

chrt fails to set a real-time (RT) scheduling policy:

[root@test1:/root] chrt -f 99 whoami
chrt: failed to set pid 0's policy: Operation not permitted

The command above asks chrt to run whoami under the SCHED_FIFO policy at priority 99, but it fails. Read literally, the error says that changing the scheduling policy of pid 0 is not permitted; pid 0 here simply means the calling process, because chrt first changes its own scheduling attributes and then execs the requested command.
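
For reference, the same chrt options can also be pointed at an already running task; a minimal sketch (pid 12345 is just a placeholder):

# Show the current policy and priority of a task
chrt -p 12345
# Try to move it to SCHED_FIFO priority 99 -- on the affected machine this
# fails with the same EPERM as above
chrt -f -p 99 12345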

2. Initial analysis

First, search for existing reports of the same problem:

Ubuntu – chrt(): “failed to set pid XXX’s policy” on one machine, but not others – iTecTec

[原创] chrt: failed to set pid xxxx's policy: Operation not permitted_Dream.Seeker的博客-CSDN博客

The fix suggested there is: sysctl -w kernel.sched_rt_runtime_us=-1

This command removes the runtime cap on real-time tasks entirely: once an RT task is runnable, it can hold a CPU until it finishes or voluntarily yields. That is a risky setting; it can easily lead to hung tasks or soft lockups, with other tasks never getting scheduled, or the system locking up outright.
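
Before (or instead of) applying that workaround, it helps to look at the current throttling settings. A quick check, assuming the stock defaults (RT tasks get at most 950000 us of every 1000000 us period):

# Read the current RT throttling parameters
sysctl kernel.sched_rt_runtime_us kernel.sched_rt_period_us
# The workaround disables RT throttling system-wide (and is not persistent
# across reboots); it can be reverted by restoring the default:
sysctl -w kernel.sched_rt_runtime_us=950000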

Why does this setting make chrt succeed, and what was the real reason for the failure in the first place? The existing write-ups do not explain that, so let's analyze it head-on.

3. Finding the root cause

Trace the system calls with strace:

strace chrt -f 99 whoami
execve("/usr/bin/chrt", ["chrt", "-f", "99", "whoami"], 0x7fff36a99428 /* 29 vars */) = 0
brk(NULL) = 0x608000
......
sched_get_priority_min(SCHED_FIFO) = 1
sched_get_priority_max(SCHED_FIFO) = 99
sched_setattr(0, {size=48, sched_policy=SCHED_FIFO, sched_flags=0, sched_nice=0, sched_priority=99, sched_runtime=0, sched_deadline=0, sched_period=0}, 0) = -1 EPERM (Operation not permitted)
write(2, "chrt: ", 6chrt: ) = 6
write(2, "failed to set pid 0's policy", 28failed to set pid 0's policy) = 28
write(2, ": ", 2: ) = 2
......
+++ exited with 1 +++

So sched_setattr() is the call returning EPERM (operation not permitted). Reading the kernel source, the call chain is sched_setattr -> __sched_setscheduler.
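
Once sched_setattr is identified as the failing call, a re-run can be narrowed to just that syscall with strace's syscall-name filter:

# Trace only the sched_setattr syscall issued by chrt
strace -e trace=sched_setattr chrt -f 99 whoami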

__sched_setscheduler looks like this:

static int __sched_setscheduler(struct task_struct *p,
                                const struct sched_attr *attr,
                                bool user, bool pi)
{
    int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
                  MAX_RT_PRIO - 1 - attr->sched_priority;
    int retval, oldprio, oldpolicy = -1, queued, running;
    int new_effective_prio, policy = attr->sched_policy;
    const struct sched_class *prev_class;
    struct rq_flags rf;
    int reset_on_fork;
    int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
    struct rq *rq;

    /* The pi code expects interrupts enabled */
    BUG_ON(pi && in_interrupt());
recheck:
    /* Double check policy once rq lock held: */
    if (policy < 0) {
        reset_on_fork = p->sched_reset_on_fork;
        policy = oldpolicy = p->policy;
    } else {
        reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);

        if (!valid_policy(policy))
            return -EINVAL;
    }

    if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
        return -EINVAL;

    /*
     * Valid priorities for SCHED_FIFO and SCHED_RR are
     * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
     * SCHED_BATCH and SCHED_IDLE is 0.
     */
    if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
        (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
        return -EINVAL;
    if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
        (rt_policy(policy) != (attr->sched_priority != 0)))
        return -EINVAL;

    /*
     * Allow unprivileged RT tasks to decrease priority:
     */
    if (user && !capable(CAP_SYS_NICE)) {
        if (fair_policy(policy)) {
            if (attr->sched_nice < task_nice(p) &&
                !can_nice(p, attr->sched_nice))
                return -EPERM;
        }

        if (rt_policy(policy)) {
            unsigned long rlim_rtprio =
                    task_rlimit(p, RLIMIT_RTPRIO);

            /* Can't set/change the rt policy: */
            if (policy != p->policy && !rlim_rtprio)
                return -EPERM;

            /* Can't increase priority: */
            if (attr->sched_priority > p->rt_priority &&
                attr->sched_priority > rlim_rtprio)
                return -EPERM;
        }

        /*
         * Can't set/change SCHED_DEADLINE policy at all for now
         * (safest behavior); in the future we would like to allow
         * unprivileged DL tasks to increase their relative deadline
         * or reduce their runtime (both ways reducing utilization)
         */
        if (dl_policy(policy))
            return -EPERM;

        /*
         * Treat SCHED_IDLE as nice 20. Only allow a switch to
         * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
         */
        if (idle_policy(p->policy) && !idle_policy(policy)) {
            if (!can_nice(p, task_nice(p)))
                return -EPERM;
        }

        /* Can't change other user's priorities: */
        if (!check_same_owner(p))
            return -EPERM;

        /* Normal users shall not reset the sched_reset_on_fork flag: */
        if (p->sched_reset_on_fork && !reset_on_fork)
            return -EPERM;
    }

    if (user) {
        if (attr->sched_flags & SCHED_FLAG_SUGOV)
            return -EINVAL;

        retval = security_task_setscheduler(p);
        if (retval)
            return retval;
    }

    /*
     * Make sure no PI-waiters arrive (or leave) while we are
     * changing the priority of the task:
     *
     * To be able to change p->policy safely, the appropriate
     * runqueue lock must be held.
     */
    rq = task_rq_lock(p, &rf);
    update_rq_clock(rq);

    /*
     * Changing the policy of the stop threads its a very bad idea:
     */
    if (p == rq->stop) {
        task_rq_unlock(rq, p, &rf);
        return -EINVAL;
    }

    /*
     * If not changing anything there's no need to proceed further,
     * but store a possible modification of reset_on_fork.
     */
    if (unlikely(policy == p->policy)) {
        if (fair_policy(policy) && attr->sched_nice != task_nice(p))
            goto change;
        if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
            goto change;
        if (dl_policy(policy) && dl_param_changed(p, attr))
            goto change;

        p->sched_reset_on_fork = reset_on_fork;
        task_rq_unlock(rq, p, &rf);
        return 0;
    }
change:

    if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
        /*
         * Do not allow realtime tasks into groups that have no runtime
         * assigned.
         */
        if (rt_bandwidth_enabled() && rt_policy(policy) &&
                task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                !task_group_is_autogroup(task_group(p))) {
            task_rq_unlock(rq, p, &rf);
            return -EPERM;
        }
#endif
#ifdef CONFIG_SMP
        if (dl_bandwidth_enabled() && dl_policy(policy) &&
                !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
            cpumask_t *span = rq->rd->span;

            /*
             * Don't allow tasks with an affinity mask smaller than
             * the entire root_domain to become SCHED_DEADLINE. We
             * will also fail if there's no bandwidth available.
             */
            if (!cpumask_subset(span, &p->cpus_allowed) ||
                rq->rd->dl_bw.bw == 0) {
                task_rq_unlock(rq, p, &rf);
                return -EPERM;
            }
        }
#endif
    }

    /* Re-check policy now with rq lock held: */
    if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
        policy = oldpolicy = -1;
        task_rq_unlock(rq, p, &rf);
        goto recheck;
    }

    /*
     * If setscheduling to SCHED_DEADLINE (or changing the parameters
     * of a SCHED_DEADLINE task) we need to check if enough bandwidth
     * is available.
     */
    if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
        task_rq_unlock(rq, p, &rf);
        return -EBUSY;
    }

    p->sched_reset_on_fork = reset_on_fork;
    oldprio = p->prio;

    if (pi) {
        /*
         * Take priority boosted tasks into account. If the new
         * effective priority is unchanged, we just store the new
         * normal parameters and do not touch the scheduler class and
         * the runqueue. This will be done when the task deboost
         * itself.
         */
        new_effective_prio = rt_effective_prio(p, newprio);
        if (new_effective_prio == oldprio)
            queue_flags &= ~DEQUEUE_MOVE;
    }

    queued = task_on_rq_queued(p);
    running = task_current(rq, p);
    if (queued)
        dequeue_task(rq, p, queue_flags);
    if (running)
        put_prev_task(rq, p);

    prev_class = p->sched_class;
    __setscheduler(rq, p, attr, pi);

    if (queued) {
        /*
         * We enqueue to tail when the priority of a task is
         * increased (user space view).
         */
        if (oldprio < p->prio)
            queue_flags |= ENQUEUE_HEAD;

        enqueue_task(rq, p, queue_flags);
    }
    if (running)
        set_curr_task(rq, p);

    check_class_changed(rq, p, prev_class, oldprio);

    /* Avoid rq from going away on us: */
    preempt_disable();
    task_rq_unlock(rq, p, &rf);

    if (pi)
        rt_mutex_adjust_pi(p);

    /* Run balance callbacks after we've adjusted the PI chain: */
    balance_callback(rq);
    preempt_enable();

    return 0;
}

Going straight to the places that return EPERM, one branch stands out because it depends on rt_runtime:

/*
 * Do not allow realtime tasks into groups that have no runtime
 * assigned.
 */
if (rt_bandwidth_enabled() && rt_policy(policy) &&
        task_group(p)->rt_bandwidth.rt_runtime == 0 &&
        !task_group_is_autogroup(task_group(p))) {
    task_rq_unlock(rq, p, &rf);
    return -EPERM;
}

rt_bandwidth_enabled() simply checks the sysctl value:

static inline int rt_bandwidth_enabled(void)
{
    return sysctl_sched_rt_runtime >= 0;
}

and sysctl_sched_rt_runtime is what kernel.sched_rt_runtime_us maps to in the sysctl table:

{
    .procname     = "sched_rt_runtime_us",
    .data         = &sysctl_sched_rt_runtime,
    .maxlen       = sizeof(int),
    .mode         = 0644,
    .proc_handler = sched_rt_handler,
},

Read plainly, the check says: if kernel.sched_rt_runtime_us is greater than or equal to 0 (RT runtime limiting is enabled), the target task sits in a task group (cgroup) whose RT runtime budget is 0, and that group is not an autogroup, then switching the task to an RT policy via sched_setattr is refused with EPERM.
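
Note that this branch only exists when the kernel was built with CONFIG_RT_GROUP_SCHED. Whether that is the case can be checked on the running system, for example (config file locations vary by distribution):

# Either of these usually works, depending on where the kernel config lives
grep CONFIG_RT_GROUP_SCHED /boot/config-$(uname -r)
zgrep CONFIG_RT_GROUP_SCHED /proc/config.gz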

sysctl -w kernel.sched_rt_runtime_us=-1 writes -1 to /proc/sys/kernel/sched_rt_runtime_us, i.e. sysctl_sched_rt_runtime = -1. After that, rt_bandwidth_enabled() returns false, the condition above no longer triggers, and the RT policy change is effectively permitted; but, as noted earlier, it also removes the RT throttling safety net and leaves the system exposed.

4. Solution

The real fix lies in this check:

if (rt_bandwidth_enabled() && rt_policy(policy) &&
        task_group(p)->rt_bandwidth.rt_runtime == 0 &&
        !task_group_is_autogroup(task_group(p)))

What trips it here is that the task group's RT runtime budget is 0; raising it above 0 is enough. The system default for the root group is 950000, i.e. 0.95 s out of each period. So how is this value adjusted?

First check the rt_runtime_us setting of the cgroup that new processes end up in; sure enough, it is 0:

[root@test1:/root]
cat /sys/fs/cgroup/cpu/system.slice/cpu.rt_runtime_us
0
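
If it is not obvious which cgroup a task lands in, the RT budget of every cpu-controller group can be listed in one pass; a quick survey (cgroup v1 layout assumed, as on this machine):

# Print cpu.rt_runtime_us for every group under the cpu controller
find /sys/fs/cgroup/cpu -name cpu.rt_runtime_us | xargs grep -H .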

Steps to fix

1. Find the cgroup of the process on the current system: cat /proc/<pid>/cgroup

[root@test1:/root]
cat /proc/35216/cgroup
12:devices:/system.slice
11:cpuset:/
10:perf_event:/
9:freezer:/
8:memory:/system.slice
7:pids:/
6:blkio:/
5:cpu,cpuacct:/system.slice
4:net_cls,net_prio:/
3:rdma:/
2:hugetlb:/
1:name=systemd:/system.slice

2. In that cgroup, set cpu.rt_runtime_us to 950000 (or any other value greater than 0, depending on your RT budgeting policy): echo 950000 > /sys/fs/cgroup/cpu/<cgroup of the process>/cpu.rt_runtime_us

[root@test1:/root]
echo 950000 > /sys/fs/cgroup/cpu/system.slice/cpu.rt_runtime_us
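
With the group budget in place, verify the change and re-try the original command; it should now succeed:

# Confirm the new budget took effect
cat /sys/fs/cgroup/cpu/system.slice/cpu.rt_runtime_us
# Re-run the failing command; it should now print the user name instead of EPERM
chrt -f 99 whoami

Note that cpu.rt_runtime_us is not persistent across reboots, so the setting has to be reapplied at boot (for example from a startup script) if it is needed permanently.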
