First Attempt at Patching Kernel
  A Simple Fix of VMA Merging Issue


       Caspar Zhang @ linuxfb

         caspar@casparzhang.com


         September 19, 2011
Agenda


  Background


  Issue Spotted


  Analysis


  Patchwork




                  First Attempt at Patching Kernel   2/19
Background
     Glossary: VMA
     <linux/mm_types.h>: struct vm_area_struct
     $ cat /proc/<pid>/maps
     mbind(): Set NUMA policy for a memory range
     Glossary: NUMA




                       First Attempt at Patching Kernel   3/19
Issue Spotted

  An upstream commit with reproducer.
  commit 9d8cebd4bcd7c3878462fdfda34bbcdeb4df7ef4
  Author: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
  Date: Fri Mar 5 13:41:57 2010 -0800

        mm: fix mbind vma merge problem

     Strangely, current mbind() doesn’t merge vma with neighbor vma
  although it’s possible.
     Unfortunately, many vma can reduce performance...

        This patch fixes it.

        reproduced program
  ...


                              First Attempt at Patching Kernel        4/19
Reproducer

 1          addr = mmap(NULL, pagesize*3, PROT_READ|PROT_WRITE,
 2                  MAP_ANON|MAP_PRIVATE, 0, 0);
 3          if (addr == MAP_FAILED)
 4              perror("mmap "), exit(1);
 5
 6          /* make page populate */
 7          memset(addr, 0, pagesize*3);
 8
 9          /* first mbind */
10          err = mbind(addr+pagesize, pagesize, MPOL_BIND, nmask->maskp,
11                  nmask->size, MPOL_MF_MOVE_ALL);
12
13          /* second mbind */
14          err = mbind(addr, pagesize*3, MPOL_DEFAULT, NULL, 0, 0);


     mmap:               |==========================|
     mbind1: ...|--------|========|oooooooo|========|--------|...
     mbind2: ...|--------|========|========|========|--------|...
                A        B        C        D        E        F



                                First Attempt at Patching Kernel            5/19
Issue Spotted (cont.)

   An upstream commit with reproducer(cont.)
   result without this patch
   addr = 0x7fe26ef09000
   [snip]
   7fe26ef09000-7fe26ef0a000 rw-p 00000000 00:00 0
   7fe26ef0a000-7fe26ef0b000 rw-p 00000000 00:00 0
   7fe26ef0b000-7fe26ef0c000 rw-p 00000000 00:00 0
   7fe26ef0c000-7fe26ef0d000 rw-p 00000000 00:00 0
   => 0x7fe26ef09000-0x7fe26ef0c000 have three vmas.

   result with this patch
   addr = 0x7fc9ebc76000
   [snip]
   7fc9ebc76000-7fc9ebc7a000 rw-p 00000000 00:00 0
   7fffbe690000-7fffbe6a5000 rw-p 00000000 00:00 0 [stack]
   => 0x7fc9ebc76000-0x7fc9ebc7a000 have only one vma.

                        First Attempt at Patching Kernel     6/19
Issue Spotted (cont.)
      port the reproducer to LTP — does not work
      fix bug in LTP — still does not work
      suspect a kernel bug




                        First Attempt at Patching Kernel   7/19
Analysis - mbind_range()

 1   /* Step 2: apply policy to a range and do splits. */
 2   static int mbind_range(struct mm_struct *mm, unsigned long start,
 3                  unsigned long end, struct mempolicy *new_pol)
 4   {
 5       struct vm_area_struct *next;
 6       struct vm_area_struct *prev;
 7       struct vm_area_struct *vma;
 8       int err = 0;
 9       pgoff_t pgoff;
10       unsigned long vmstart;
11       unsigned long vmend;
12
13       vma = find_vma_prev(mm, start, &prev);
14       if (!vma || vma->vm_start > start)
15           return -EFAULT;


               start                       end
     ...|--------|========|========|========|--------|...
        A prev B vma      C        D        E        F
             vma->vm_start
                                 First Attempt at Patching Kernel        8/19
Analysis - loop

 1     for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 2         next = vma->vm_next;
 3         vmstart = max(start, vma->vm_start);
 4         vmend = min(end, vma->vm_end);
 5
 6         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 7         prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 8                   vma->anon_vma, vma->vm_file, pgoff, new_pol);
 9         if (prev) {
10             vma = prev;
11             next = vma->vm_next;
12             continue;
13         }
14         [snip]
15     }


               start                       end
     ...|--------|========|========|========|--------|...
        A prev B vma      C        D        E        F
             vma->vm_start
                               First Attempt at Patching Kernel               9/19
Analysis - snip part
     split if merged out of range:
 1      vmstart = max(start, vma->vm_start);
 2      vmend = min(end, vma->vm_end);
 3      ...
 4
 5         if (vma->vm_start != vmstart) {
 6             err = split_vma(vma->vm_mm, vma, vmstart, 1);
 7             if (err)
 8                 goto out;
 9         }
10         if (vma->vm_end != vmend) {
11             err = split_vma(vma->vm_mm, vma, vmend, 0);
12             if (err)
13                 goto out;
14         }
15         err = policy_vma(vma, new_pol);
16         if (err)
17             goto out;




                                First Attempt at Patching Kernel   10/19
Analysis - vma_merge()

 1     if (prev && prev->vm_end == addr &&
 2               mpol_equal(vma_policy(prev), policy) &&
 3               can_vma_merge_after(prev, vm_flags,
 4                         anon_vma, file, pgoff)) {
 5         [snip]
 6     }
 7     if (next && end == next->vm_start &&
 8              mpol_equal(policy, vma_policy(next)) &&
 9              can_vma_merge_before(next, vm_flags,
10                     anon_vma, file, pgoff+pglen)) {
11         [snip]
12     }


               start                       end
     ...|--------|========|========|========|--------|...
        A        B        C        D        E        F
           prev      vma     next
                    prev     vma      next

                               First Attempt at Patching Kernel   11/19
Analysis - can_vma_merge_before()

 1   static int
 2   can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
 3       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
 4   {
 5       if (is_mergeable_vma(vma, file, vm_flags) &&
 6           is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 7           if (vma->vm_pgoff == vm_pgoff)
 8               return 1;
 9       }
10       return 0;
11   }


               start                       end
     ...|--------|========|========|========|--------|...
        A        B        C vma    D        E        F
                          ^vm_pgoff
     vma_merge(): (vma)     (next)
     vma_merge():^pgoff
     vma_merge():|-pglen -|

                                 First Attempt at Patching Kernel               12/19
Analysis - can_vma_merge_after()

 1   static int
 2   can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 3       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
 4   {
 5       if (is_mergeable_vma(vma, file, vm_flags) &&
 6           is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 7           pgoff_t vm_pglen;
 8           vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 9           if (vma->vm_pgoff + vm_pglen == vm_pgoff)
10               return 1;
11       }
12       return 0;
13   }

               start                       end
     ...|--------|========|========|========|--------|...
        A         B vma    C        D       E        F
                  |-pglen -|
                           ^vm_pgoff
                  ^vma->vm_pgoff
     vma_merge(): (prev)
     vma_merge():          ^pgoff
                                 First Attempt at Patching Kernel              13/19
Analysis - tracing
 1   mempolicy.c:
 2           vmstart = max(start, vma->vm_start);
 3           vmend = min(end, vma->vm_end);
 4           prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 5                     vma->anon_vma, vma->vm_file, pgoff, new_pol);
 6
 7   mmap.c:
 8   struct vm_area_struct *vma_merge(struct mm_struct *mm,
 9               struct vm_area_struct *prev, unsigned long addr,
10               unsigned long end, unsigned long vm_flags,
11                    struct anon_vma *anon_vma, struct file *file,
12               pgoff_t pgoff, struct mempolicy *policy)
13   {
14       pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
15
16       can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen);
17       [snip]
18   }
19
20   static int
21   can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
22       struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
23   {
24       if (is_mergeable_vma(vma, file, vm_flags) &&
25           is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
26           if (vma->vm_pgoff == vm_pgoff)
27               return 1;
28       }
29       return 0;
30   }

                                          First Attempt at Patching Kernel      14/19
Analysis - tracing (cont.)
   Wrong:

       pgoff = vma->vm_pgoff +
               ((start - vma->vm_start) >> PAGE_SHIFT);
       pgoff + e - s == next->vm_pgoff ?

              start                       end
    ...|--------|========|========|========|--------|...
   off:0        1        2        3        4        5
       A        B        C        D        E        F
   1.           ^   vma                         not merge
   2.           ^        s   vma e next         not merge
                pgoff = 2 + (1 - 2) = 1
                pgoff + 3 - 2 = 2 != 3



                      First Attempt at Patching Kernel      15/19
Analysis - tracing (cont.)
   Right:

       pgoff = vma->vm_pgoff;
       pgoff + e - s == next->vm_pgoff ?

              start                       end
    ...|--------|========|========|========|--------|...
   off:0        1        2        3        4        5
       A        B        C        D        E        F
   1.           ^   vma                         not merge
   2.                    ^s vma e next          merge!
                pgoff = 2
                pgoff + 3 - 2 = 3 == 3




                      First Attempt at Patching Kernel      16/19
Patchwork

 1   diff --git a/mm/mempolicy.c b/mm/mempolicy.c
 2   index 8b57173..b1f70d6 100644
 3   --- a/mm/mempolicy.c
 4   +++ b/mm/mempolicy.c
 5   @@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 6        struct vm_area_struct *prev;
 7        struct vm_area_struct *vma;
 8        int err = 0;
 9   -    pgoff_t pgoff;
10        unsigned long vmstart;
11        unsigned long vmend;
12
13   @@ -649,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
14            vmstart = max(start, vma->vm_start);
15            vmend = min(end, vma->vm_end);
16
17   -       pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
18           prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
19   -                 vma->anon_vma, vma->vm_file, pgoff, new_pol);
20   +                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
21   +                 new_pol);
22           if (prev) {
23               vma = prev;
24               next = vma->vm_next;




                                          First Attempt at Patching Kernel                   17/19
Questions?




First Attempt at Patching Kernel   18/19
Thank you!




First Attempt at Patching Kernel   19/19