Bugtraq mailing list archives

Re: [patch] /proc race fixes for 2.2.1 (fwd)


From: andrea () e-mind com (Andrea Arcangeli)
Date: Thu, 4 Feb 1999 15:09:06 +0100


On Tue, 2 Feb 1999, Andrea Arcangeli wrote:

Side note: I hope to have diffed all the interesting changes from my tree
to 2.2.1 at the end of the email (I don't have the time to check). If for

Woops I had a bug in the patch. The bug is that when the task to grab is
the current one we must not get the mmap_semaphore otherwise we left a
window open for deadlocking on it.

Here the fixed patch against 2.2.1 again.

Index: array.c
===================================================================
RCS file: /var/cvs/linux/fs/proc/array.c,v
retrieving revision 1.1.1.5
diff -u -r1.1.1.5 array.c
--- array.c     1999/01/29 14:50:53     1.1.1.5
+++ linux/fs/proc/array.c       1999/02/04 13:59:26
@@ -386,21 +386,57 @@
        return sprintf(buffer, "%s\n", saved_command_line);
 }

-static unsigned long get_phys_addr(struct task_struct * p, unsigned long ptr)
+/*
+ * Caller must release_mm the mm_struct later.
+ * You don't get any access to init_mm.
+ */
+static struct mm_struct * grab_mm(int pid)
+{
+       struct mm_struct * mm;
+       struct task_struct * tsk;
+
+       if (current->pid == pid)
+               return current->mm;
+
+       mm = NULL;
+       read_lock(&tasklist_lock);
+       tsk = find_task_by_pid(pid);
+       /*
+        * NOTE: this doesn't race because we are protected by the
+        * big kernel lock. -arca
+        */
+       if (tsk && tsk->mm && tsk->mm != &init_mm)
+               mmget(mm = tsk->mm);
+       read_unlock(&tasklist_lock);
+       if (mm)
+               down(&mm->mmap_sem);
+       return mm;
+}
+
+static void release_mm(struct mm_struct *mm)
 {
+       if (current->mm != mm)
+       {
+               up(&mm->mmap_sem);
+               mmput(mm);
+       }
+}
+
+static unsigned long get_phys_addr(struct mm_struct *mm, unsigned long ptr)
+{
        pgd_t *page_dir;
        pmd_t *page_middle;
        pte_t pte;

-       if (!p || !p->mm || ptr >= TASK_SIZE)
+       if (ptr >= TASK_SIZE)
                return 0;
        /* Check for NULL pgd .. shouldn't happen! */
-       if (!p->mm->pgd) {
-               printk("get_phys_addr: pid %d has NULL pgd!\n", p->pid);
+       if (!mm->pgd) {
+               printk(KERN_DEBUG "missing pgd for mm %p\n", mm);
                return 0;
        }

-       page_dir = pgd_offset(p->mm,ptr);
+       page_dir = pgd_offset(mm,ptr);
        if (pgd_none(*page_dir))
                return 0;
        if (pgd_bad(*page_dir)) {
@@ -422,7 +458,7 @@
        return pte_page(pte) + (ptr & ~PAGE_MASK);
 }

-static int get_array(struct task_struct *p, unsigned long start, unsigned long end, char * buffer)
+static int get_array(struct mm_struct *mm, unsigned long start, unsigned long end, char * buffer)
 {
        unsigned long addr;
        int size = 0, result = 0;
@@ -431,7 +467,7 @@
        if (start >= end)
                return result;
        for (;;) {
-               addr = get_phys_addr(p, start);
+               addr = get_phys_addr(mm, start);
                if (!addr)
                        return result;
                do {
@@ -453,27 +489,28 @@

 static int get_env(int pid, char * buffer)
 {
-       struct task_struct *p;
-       
-       read_lock(&tasklist_lock);
-       p = find_task_by_pid(pid);
-       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
+       struct mm_struct *mm;
+       int res = 0;

-       if (!p || !p->mm)
-               return 0;
-       return get_array(p, p->mm->env_start, p->mm->env_end, buffer);
+       mm = grab_mm(pid);
+       if (mm) {
+               res = get_array(mm, mm->env_start, mm->env_end, buffer);
+               release_mm(mm);
+       }
+       return res;
 }

 static int get_arg(int pid, char * buffer)
 {
-       struct task_struct *p;
+       struct mm_struct *mm;
+       int res = 0;

-       read_lock(&tasklist_lock);
-       p = find_task_by_pid(pid);
-       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
-       if (!p || !p->mm)
-               return 0;
-       return get_array(p, p->mm->arg_start, p->mm->arg_end, buffer);
+       mm = grab_mm(pid);
+       if (mm) {
+               res = get_array(mm, mm->arg_start, mm->arg_end, buffer);
+               release_mm(mm);
+       }
+       return res;
 }

 /*
@@ -722,12 +759,10 @@
        return buffer;
 }

-static inline char * task_mem(struct task_struct *p, char *buffer)
+static inline char * task_mem(struct mm_struct * mm, char *buffer)
 {
-       struct mm_struct * mm = p->mm;
-
-       if (mm && mm != &init_mm) {
-               struct vm_area_struct * vma = mm->mmap;
+       if (mm) {
+               struct vm_area_struct * vma;
                unsigned long data = 0, stack = 0;
                unsigned long exec = 0, lib = 0;

@@ -817,47 +852,96 @@
                            cap_t(p->cap_effective));
 }

+static struct task_struct *grab_task(int pid, struct mm_struct ** mm)
+{
+       struct task_struct *tsk = current;
+       
+       if (current->pid == pid)
+       {
+               *mm = current->mm;
+               return current;
+       }
+
+       *mm = NULL;
+       read_lock(&tasklist_lock);
+       tsk = find_task_by_pid(pid);
+       if (tsk)
+       {
+               struct mm_struct * __mm;
+               struct page * page = mem_map + MAP_NR(tsk);
+               atomic_inc(&page->count);
+               /*
+                * NOTE: this doesn't race because we are protected
+                * by the big kernel lock. -arca
+                */
+               __mm = tsk->mm;
+               if (__mm && __mm != &init_mm)
+               {
+                       mmget(__mm);
+                       *mm = __mm;
+               }
+       }
+       read_unlock(&tasklist_lock);
+       if (*mm)
+               down(&(*mm)->mmap_sem);
+
+       return tsk;
+}
+
+static void release_task(struct task_struct *tsk, struct mm_struct * mm)
+{
+       if (current != tsk)
+       {
+               if (mm)
+               {
+                       up(&mm->mmap_sem);
+                       mmput(mm);
+               }
+               free_pages((unsigned long) tsk, 1);
+       }
+}

 static int get_status(int pid, char * buffer)
 {
        char * orig = buffer;
        struct task_struct *tsk;
-
-       read_lock(&tasklist_lock);
-       tsk = find_task_by_pid(pid);
-       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
+       struct mm_struct * mm;
+       
+       tsk = grab_task(pid, &mm);
        if (!tsk)
                return 0;
        buffer = task_name(tsk, buffer);
        buffer = task_state(tsk, buffer);
-       buffer = task_mem(tsk, buffer);
+       buffer = task_mem(mm, buffer);
        buffer = task_sig(tsk, buffer);
        buffer = task_cap(tsk, buffer);
+       release_task(tsk, mm);
        return buffer - orig;
 }

 static int get_stat(int pid, char * buffer)
 {
        struct task_struct *tsk;
+       struct mm_struct * mm;
        unsigned long vsize, eip, esp, wchan;
        long priority, nice;
        int tty_pgrp;
        sigset_t sigign, sigcatch;
        char state;
+       int res;

-       read_lock(&tasklist_lock);
-       tsk = find_task_by_pid(pid);
-       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
+       tsk = grab_task(pid, &mm);
        if (!tsk)
                return 0;
        state = *get_task_state(tsk);
        vsize = eip = esp = 0;
-       if (tsk->mm && tsk->mm != &init_mm) {
-               struct vm_area_struct *vma = tsk->mm->mmap;
-               while (vma) {
+       if (mm) {
+               struct vm_area_struct *vma;
+
+               for (vma = mm->mmap; vma; vma = vma->vm_next) {
                        vsize += vma->vm_end - vma->vm_start;
-                       vma = vma->vm_next;
                }
+               
                eip = KSTK_EIP(tsk);
                esp = KSTK_ESP(tsk);
        }
@@ -878,7 +962,7 @@
        nice = tsk->priority;
        nice = 20 - (nice * 20 + DEF_PRIORITY / 2) / DEF_PRIORITY;

-       return sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
+       res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
 %lu %lu %lu %lu %lu %lu %lu %lu %d\n",
                pid,
@@ -904,11 +988,11 @@
                tsk->it_real_value,
                tsk->start_time,
                vsize,
-               tsk->mm ? tsk->mm->rss : 0, /* you might want to shift this left 3 */
+               mm ? mm->rss : 0, /* you might want to shift this left 3 */
                tsk->rlim ? tsk->rlim[RLIMIT_RSS].rlim_cur : 0,
-               tsk->mm ? tsk->mm->start_code : 0,
-               tsk->mm ? tsk->mm->end_code : 0,
-               tsk->mm ? tsk->mm->start_stack : 0,
+               mm ? mm->start_code : 0,
+               mm ? mm->end_code : 0,
+               mm ? mm->start_stack : 0,
                esp,
                eip,
                /* The signal information here is obsolete.
@@ -923,6 +1007,9 @@
                tsk->nswap,
                tsk->cnswap,
                tsk->exit_signal);
+
+       release_task(tsk, mm);
+       return res;
 }
                
 static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size,
@@ -1000,19 +1087,15 @@

 static int get_statm(int pid, char * buffer)
 {
-       struct task_struct *tsk;
        int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0;
+       struct mm_struct *mm;

-       read_lock(&tasklist_lock);
-       tsk = find_task_by_pid(pid);
-       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
-       if (!tsk)
-               return 0;
-       if (tsk->mm && tsk->mm != &init_mm) {
-               struct vm_area_struct * vma = tsk->mm->mmap;
+       mm = grab_mm(pid);
+       if (mm) {
+               struct vm_area_struct * vma = mm->mmap;

                while (vma) {
-                       pgd_t *pgd = pgd_offset(tsk->mm, vma->vm_start);
+                       pgd_t *pgd = pgd_offset(mm, vma->vm_start);
                        int pages = 0, shared = 0, dirty = 0, total = 0;

                        statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
@@ -1030,6 +1113,7 @@
                                drs += pages;
                        vma = vma->vm_next;
                }
+               release_mm(mm);
        }
        return sprintf(buffer,"%d %d %d %d %d %d %d\n",
                       size, resident, share, trs, lrs, drs, dt);
@@ -1067,16 +1151,15 @@

 #define MAPS_LINE_MAX  MAPS_LINE_MAX8

-
 static ssize_t read_maps (int pid, struct file * file, char * buf,
                          size_t count, loff_t *ppos)
 {
-       struct task_struct *p;
+       struct task_struct * p;
+       struct mm_struct * mm;
        struct vm_area_struct * map, * next;
        char * destptr = buf, * buffer;
        loff_t lineno;
        ssize_t column, i;
-       int volatile_task;
        long retval;

        /*
@@ -1088,24 +1171,19 @@
                goto out;

        retval = -EINVAL;
-       read_lock(&tasklist_lock);
-       p = find_task_by_pid(pid);
-       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */
+       p = grab_task(pid, &mm);
        if (!p)
                goto freepage_out;

-       if (!p->mm || p->mm == &init_mm || count == 0)
+       if (!mm || count == 0)
                goto getlen_out;

-       /* Check whether the mmaps could change if we sleep */
-       volatile_task = (p != current || atomic_read(&p->mm->count) > 1);
-
        /* decode f_pos */
        lineno = *ppos >> MAPS_LINE_SHIFT;
        column = *ppos & (MAPS_LINE_LENGTH-1);

        /* quickly go to line lineno */
-       for (map = p->mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
+       for (map = mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
                continue;

        for ( ; map ; map = next ) {
@@ -1176,18 +1254,13 @@
                /* done? */
                if (count == 0)
                        break;
-
-               /* By writing to user space, we might have slept.
-                * Stop the loop, to avoid a race condition.
-                */
-               if (volatile_task)
-                       break;
        }

        /* encode f_pos */
        *ppos = (lineno << MAPS_LINE_SHIFT) + column;

 getlen_out:
+       release_task(p, mm);
        retval = destptr - buf;

 freepage_out:
@@ -1199,15 +1272,12 @@
 #ifdef __SMP__
 static int get_pidcpu(int pid, char * buffer)
 {
-       struct task_struct * tsk = current ;
+       struct task_struct * tsk;
+       struct mm_struct * mm;
        int i, len;
-
-       read_lock(&tasklist_lock);
-       if (pid != tsk->pid)
-               tsk = find_task_by_pid(pid);
-       read_unlock(&tasklist_lock);    /* FIXME!! This should be done after the last use */

-       if (tsk == NULL)
+       tsk = grab_task(pid, &mm);
+       if (!tsk)
                return 0;

        len = sprintf(buffer,
@@ -1221,6 +1291,7 @@
                        tsk->per_cpu_utime[cpu_logical_map(i)],
                        tsk->per_cpu_stime[cpu_logical_map(i)]);

+       release_task(tsk, mm);
        return len;
 }
 #endif


Excuse me for the mistake. I noticed the bug only some mintues ago.

Andrea Arcangeli



Current thread: