找回密码
 用户注册

QQ登录

只需一步,快速开始

查看: 4016|回复: 0

谈谈守护进程与僵尸进程

[复制链接]
发表于 2011-12-31 19:37:11 | 显示全部楼层 |阅读模式
04年时维护的第一个商业服务就用了两次fork产生守护进程的做法,前两天在网上看到许多帖子以及一些unix书籍,认为一次fork后产生守护进程足够了,各有道理吧,不过多了一次fork到底是出于什么目的呢?

进程也就是task,看看内核里维护进程的数据结构task_struct,这里有两个成员:
  1. struct task_struct {
  2.         volatile long state;
  3.         int exit_state;
  4.         ...
  5. }
复制代码
再看看include/linux/sched.h里这两个成员(state和exit_state)可能的取值:
  1. #define TASK_RUNNING                0
  2. #define TASK_INTERRUPTIBLE        1
  3. #define TASK_UNINTERRUPTIBLE        2
  4. #define __TASK_STOPPED                4
  5. #define __TASK_TRACED                8
  6. /* in tsk->exit_state */
  7. #define EXIT_ZOMBIE                16
  8. #define EXIT_DEAD                32
  9. /* in tsk->state again */
  10. #define TASK_DEAD                64
  11. #define TASK_WAKEKILL                128
  12. #define TASK_WAKING                256
  13. #define TASK_STATE_MAX                512
复制代码
可以看到,进程状态里除了大家都理解的running/interruptible/uninterruptible/stopped等状态外,还有一个ZOMBIE状态,这个状态是怎么回事呢?


这是因为linux里的进程都属于一棵树,树的根结点是linux系统初始化结束阶段时启动的init进程,这个进程的pid是1,所有的其他进程都是它的子孙。除了init,任何进程一定有它的父进程,而父进程会负责分配(fork)、回收(wait4)它申请的进程资源。这个树状关系也比较健壮:当某个进程还在运行时,它的父进程却退出了,这个进程并不会成为无人管理的孤儿进程,因为linux有一个机制,init进程会接管它,成为它的父进程。这也是守护进程的由来了,因为守护进程的其中一个要求就是希望init成为守护进程的父进程。

如果某个进程自身终止了,在调用exit清理完相关的内存、文件等资源后,它就会进入ZOMBIE状态,等待它的父进程调用wait4来回收这个task_struct。但是,如果父进程一直没有调用wait4去释放子进程的task_struct,问题就来了,这个task_struct谁来回收呢?永远没有人,除非父进程终止,这个ZOMBIE进程被init进程接管,然后由init调用wait4来回收进程描述符。如果父进程一直在运行着,这个ZOMBIE会永远地占用系统资源,用kill发任何信号也不能释放它。这是很可怕的,因为服务器上可能会出现无数ZOMBIE进程导致机器挂掉。

来看看内核代码吧。进程在退出时执行sys_exit(C程序里main函数返回后也会执行到),而它会调用do_exit,do_exit首先清理进程使用的资源,然后调用exit_notify函数,将进程置为僵尸ZOMBIE状态,决定是否要以init进程作为当前进程的子进程的父进程,最后通知当前进程的父进程:
kernel/exit.c
  1. static void exit_notify(struct task_struct *tsk)
  2. {
  3.         int state;
  4.         struct task_struct *t;
  5.         struct list_head ptrace_dead, *_p, *_n;
  6.         if (signal_pending(tsk) && !tsk->signal->group_exit
  7.             && !thread_group_empty(tsk)) {
  8.                 /*
  9.                  * This occurs when there was a race between our exit
  10.                  * syscall and a group signal choosing us as the one to
  11.                  * wake up.  It could be that we are the only thread
  12.                  * alerted to check for pending signals, but another thread
  13.                  * should be woken now to take the signal since we will not.
  14.                  * Now we'll wake all the threads in the group just to make
  15.                  * sure someone gets all the pending signals.
  16.                  */
  17.                 read_lock(&tasklist_lock);
  18.                 spin_lock_irq(&tsk->sighand->siglock);
  19.                 for (t = next_thread(tsk); t != tsk; t = next_thread(t))
  20.                         if (!signal_pending(t) && !(t->flags & PF_EXITING)) {
  21.                                 recalc_sigpending_tsk(t);
  22.                                 if (signal_pending(t))
  23.                                         signal_wake_up(t, 0);
  24.                         }
  25.                 spin_unlock_irq(&tsk->sighand->siglock);
  26.                 read_unlock(&tasklist_lock);
  27.         }
  28.         write_lock_irq(&tasklist_lock);
  29.         /*
  30.          * This does two things:
  31.          *
  32.            * A.  Make init inherit all the child processes
  33.          * B.  Check to see if any process groups have become orphaned
  34.          *        as a result of our exiting, and if they have any stopped
  35.          *        jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
  36.          */
  37.         INIT_LIST_HEAD(&ptrace_dead);
  38.         forget_original_parent(tsk, &ptrace_dead);
  39.         BUG_ON(!list_empty(&tsk->children));
  40.         BUG_ON(!list_empty(&tsk->ptrace_children));
  41.         /*
  42.          * Check to see if any process groups have become orphaned
  43.          * as a result of our exiting, and if they have any stopped
  44.          * jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
  45.          *
  46.          * Case i: Our father is in a different pgrp than we are
  47.          * and we were the only connection outside, so our pgrp
  48.          * is about to become orphaned.
  49.          */
  50.          
  51.         t = tsk->real_parent;
  52.        
  53.         if ((process_group(t) != process_group(tsk)) &&
  54.             (t->signal->session == tsk->signal->session) &&
  55.             will_become_orphaned_pgrp(process_group(tsk), tsk) &&
  56.             has_stopped_jobs(process_group(tsk))) {
  57.                 __kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
  58.                 __kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
  59.         }
  60.         /* Let father know we died
  61.          *
  62.          * Thread signals are configurable, but you aren't going to use
  63.          * that to send signals to arbitary processes.
  64.          * That stops right now.
  65.          *
  66.          * If the parent exec id doesn't match the exec id we saved
  67.          * when we started then we know the parent has changed security
  68.          * domain.
  69.          *
  70.          * If our self_exec id doesn't match our parent_exec_id then
  71.          * we have changed execution domain as these two values started
  72.          * the same after a fork.
  73.          *       
  74.          */
  75.        
  76.         if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
  77.             ( tsk->parent_exec_id != t->self_exec_id  ||
  78.               tsk->self_exec_id != tsk->parent_exec_id)
  79.             && !capable(CAP_KILL))
  80.                 tsk->exit_signal = SIGCHLD;
  81.         /* If something other than our normal parent is ptracing us, then
  82.          * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
  83.          * only has special meaning to our real parent.
  84.          */
  85.         if (tsk->exit_signal != -1 && thread_group_empty(tsk)) {
  86.                 int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
  87.                 do_notify_parent(tsk, signal);
  88.         } else if (tsk->ptrace) {
  89.                 do_notify_parent(tsk, SIGCHLD);
  90.         }
  91.         state = EXIT_ZOMBIE;
  92.         if (tsk->exit_signal == -1 && tsk->ptrace == 0)
  93.                 state = EXIT_DEAD;
  94.         tsk->exit_state = state;
  95.         /*
  96.          * Clear these here so that update_process_times() won't try to deliver
  97.          * itimer, profile or rlimit signals to this task while it is in late exit.
  98.          */
  99.         tsk->it_virt_value = 0;
  100.         tsk->it_prof_value = 0;
  101.         write_unlock_irq(&tasklist_lock);
  102.         list_for_each_safe(_p, _n, &ptrace_dead) {
  103.                 list_del_init(_p);
  104.                 t = list_entry(_p,struct task_struct,ptrace_list);
  105.                 release_task(t);
  106.         }
  107.         /* If the process is dead, release it - nobody will wait for it */
  108.         if (state == EXIT_DEAD)
  109.                 release_task(tsk);
  110.         /* PF_DEAD causes final put_task_struct after we schedule. */
  111.         preempt_disable();
  112.         tsk->flags |= PF_DEAD;
  113. }
复制代码
大家可以看到这段内核代码的注释非常全。其中forget_original_parent这个函数会为该进程的所有子进程重设父进程,交给init进程接管。


回过头来,看看为什么守护进程要fork两次。这里有一个假定,父进程生成守护进程后,还有自己的事要做,它的人生意义并不只是为了生成守护进程。这样,如果父进程fork一次创建了一个守护进程,然后继续做其它事时阻塞了,这时守护进程一直在运行,父进程却没有正常退出。如果守护进程因为正常或非正常原因退出了,由于父进程仍然存活却没有调用wait4来回收它,它就会变成ZOMBIE进程。
如果fork两次呢?父进程先fork出一个儿子进程,儿子进程再fork出孙子进程作为守护进程,然后儿子进程立刻退出(父进程在fork之后随即调用wait4回收这个很快退出的儿子进程即可),守护进程被init进程接管,这样无论父进程做什么事,无论怎么被阻塞,都与守护进程无关了。所以,fork两次的守护进程很安全,避免了僵尸进程出现的可能性。
作者:russell_tao 发表于2011-12-21 11:00:13 原文链接
您需要登录后才可以回帖 登录 | 用户注册

本版积分规则

Archiver|手机版|小黑屋|ACE Developer ( 京ICP备06055248号 )

GMT+8, 2024-5-8 08:23 , Processed in 0.015134 second(s), 7 queries , Redis On.

Powered by Discuz! X3.5

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表