文章目录

linux查看系统内存常用命令free中有2个重要的参数buffers和cached:

             total       used       free     shared    buffers     cached
Mem:      16436192   10914412    5521780          0     502256    5795248
-/+ buffers/cache:    4616908   11819284
Swap:      4096532        116    4096416

这2个参数到底是什么意思,百度了各种文章,大多概念混淆、含糊不清,难以得到满意的答案,今天做一个源码探索,发现free原来就是获取/proc/meminfo中对应的参数。

执行 free;cat /proc/meminfo,两边的 buffers、cached完全一致

             total       used       free     shared    buffers     cached
Mem:      16436192   10914716    5521476          0     502256    5795248
-/+ buffers/cache:    4617212   11818980
Swap:      4096532        116    4096416
——————————————————————————————————————————————————————————————————————————
MemTotal:     16436192 kB
MemFree:       5521468 kB
Buffers:        502256 kB
Cached:        5795248 kB
SwapCached:         12 kB
Active:        6215352 kB
Inactive:      4218000 kB
HighTotal:           0 kB
.....
HugePages_Total:     0
HugePages_Free:      0
HugePages_Rsvd:      0
Hugepagesize:     2048 kB

不仅仅如此,top、vmstat、uptime等等一堆重要的命令中的数据都来自于/proc下的文件。

下面就聊聊源码的探索过程。

从网络资料可以探究到free是procps软件包的一部分,从网上下载procps源码(3.2.3),很容易看出free.c就是free的入口,main函数中meminfo()是关键

// Report loop excerpted from free's main(): each pass calls meminfo() to
// refresh the kb_* values and then prints one report. The loop repeats
// while pause_length is non-zero and --count is non-zero.
// S() is a macro defined outside this excerpt (presumably unit scaling
// of the kB values -- confirm against free.c).
do {
    meminfo();
    printf("             total       used       free     shared    buffers     cached\n");
    // "Mem:" row: the buffers and cached columns are printed straight from
    // kb_main_buffers / kb_main_cached, which meminfo() fills from the
    // "Buffers" and "Cached" rows.
    printf(
        "%-7s %10Lu %10Lu %10Lu %10Lu %10Lu %10Lu\n", "Mem:",
        S(kb_main_total),
        S(kb_main_used),
        S(kb_main_free),
        S(kb_main_shared),
        S(kb_main_buffers),
        S(kb_main_cached)
    );
    // Print low vs. high information, if the user requested it.
    // Note we check if low_total==0: if so, then this kernel does
    // not export the low and high stats.  Note we still want to
    // print the high info, even if it is zero.
    if (show_high) {
        printf(
            "%-7s %10Lu %10Lu %10Lu\n", "Low:",
            S(kb_low_total),
            S(kb_low_total - kb_low_free),
            S(kb_low_free)
        );
        printf(
            "%-7s %10Lu %10Lu %10Lu\n", "High:",
            S(kb_high_total),
            S(kb_high_total - kb_high_free),
            S(kb_high_free)
        );
    }
    // "-/+ buffers/cache" row (skipped when old_fmt is set):
    //   used column = used - (buffers + cached)
    //   free column = free + (buffers + cached)
    if(!old_fmt){
        unsigned KLONG buffers_plus_cached = kb_main_buffers + kb_main_cached;
        printf(
            "-/+ buffers/cache: %10Lu %10Lu\n", 
            S(kb_main_used - buffers_plus_cached),
            S(kb_main_free + buffers_plus_cached)
        );
    }
    printf(
        "%-7s %10Lu %10Lu %10Lu\n", "Swap:",
        S(kb_swap_total),
        S(kb_swap_used),
        S(kb_swap_free)
    );
    // Optional "Total:" row: main memory and swap summed column by column.
    if(show_total){
        printf(
            "%-7s %10Lu %10Lu %10Lu\n", "Total:",
            S(kb_main_total + kb_swap_total),
            S(kb_main_used  + kb_swap_used),
            S(kb_main_free  + kb_swap_free)
        );
    }
    // Repeat mode: emit a blank separator line, flush stdout, and sleep
    // (pause_length is passed to usleep) unless count is 1.
    if(pause_length){
    fputc('\n', stdout);
    fflush(stdout);
    if (count != 1) usleep(pause_length);
}
} while(pause_length && --count);

meminfo()的代码在Sysinfo.c中

/*
 * Refresh the kb_* globals from the text in `buf`.
 *
 * FILE_TO_BUF(MEMINFO_FILE, meminfo_fd) is a macro defined outside this
 * excerpt; the parse loop starts at `buf`, so presumably the macro reads
 * MEMINFO_FILE into that buffer (confirm against sysinfo.c).
 *
 * Each line is treated as "Name: value". The name is looked up in mem_table
 * with bsearch(); on a hit the number after the colon is parsed with
 * strtoul() (base 10) and stored through the entry's slot pointer. Names not
 * in the table, and names too long for namebuf, are skipped.
 *
 * After parsing, derived values are filled in: low totals fall back to the
 * main totals when kb_low_total is zero, kb_inactive falls back to the sum
 * of the three Inact_* values when no "Inactive" row set it, and
 * kb_swap_used / kb_main_used are computed as total minus free.
 */
void meminfo(void){
  char namebuf[16]; /* big enough to hold any row name */
  /* Search key for bsearch(): its first member points at namebuf. */
  mem_table_struct findme = { namebuf, NULL};
  mem_table_struct *found;
  char *head;
  char *tail;
  /*
   * Row name -> destination variable. bsearch() is used on this table, so
   * the entries must stay ordered consistently with
   * compare_mem_table_structs (defined elsewhere; presumably a comparison
   * of the names -- confirm).
   */
  static const mem_table_struct mem_table[] = {
  {"Active",       &kb_active},       // important
  {"Buffers",      &kb_main_buffers}, // important
  {"Cached",       &kb_main_cached},  // important
  {"Committed_AS", &kb_committed_as},
  {"Dirty",        &kb_dirty},        // kB version of vmstat nr_dirty
  {"HighFree",     &kb_high_free},
  {"HighTotal",    &kb_high_total},
  {"Inact_clean",  &kb_inact_clean},
  {"Inact_dirty",  &kb_inact_dirty},
  {"Inact_laundry",&kb_inact_laundry},
  {"Inact_target", &kb_inact_target},
  {"Inactive",     &kb_inactive},     // important
  {"LowFree",      &kb_low_free},
  {"LowTotal",     &kb_low_total},
  {"Mapped",       &kb_mapped},       // kB version of vmstat nr_mapped
  {"MemFree",      &kb_main_free},    // important
  {"MemShared",    &kb_main_shared},  // important
  {"MemTotal",     &kb_main_total},   // important
  {"PageTables",   &kb_pagetables},   // kB version of vmstat nr_page_table_pages
  {"ReverseMaps",  &nr_reversemaps},  // same as vmstat nr_page_table_pages
  {"Slab",         &kb_slab},         // kB version of vmstat nr_slab
  {"SwapCached",   &kb_swap_cached},
  {"SwapFree",     &kb_swap_free},    // important
  {"SwapTotal",    &kb_swap_total},   // important
  {"Writeback",    &kb_writeback},    // kB version of vmstat nr_writeback
  };
  const int mem_table_count = sizeof(mem_table)/sizeof(mem_table_struct);

  FILE_TO_BUF(MEMINFO_FILE,meminfo_fd);

  /* Sentinel: still ~0UL after the loop means no "Inactive" row was stored. */
  kb_inactive = ~0UL;

  head = buf;
  for(;;){
    /* Split off the row name at the colon; stop when no colon remains. */
    tail = strchr(head, ':');
    if(!tail) break;
    *tail = '\0';
    /* A name that would not fit in namebuf is skipped rather than copied. */
    if(strlen(head) >= sizeof(namebuf)){
      head = tail+1;
      goto nextline;
    }
    strcpy(namebuf,head);
    found = bsearch(&findme, mem_table, mem_table_count,
        sizeof(mem_table_struct), compare_mem_table_structs
    );
    head = tail+1;
    if(!found) goto nextline;
    /* Store the decimal value that follows the colon. */
    *(found->slot) = strtoul(head,&tail,10);
nextline:
    /* Advance to the character after the next newline; stop if there is none. */
    tail = strchr(head, '\n');
    if(!tail) break;
    head = tail+1;
  }
  if(!kb_low_total){  /* low==main except with large-memory support */
    kb_low_total = kb_main_total;
    kb_low_free  = kb_main_free;
  }
  /* No "Inactive" row was stored: rebuild it from the three Inact_* rows. */
  if(kb_inactive==~0UL){
    kb_inactive = kb_inact_dirty + kb_inact_clean + kb_inact_laundry;
  }
  kb_swap_used = kb_swap_total - kb_swap_free;
  kb_main_used = kb_main_total - kb_main_free;
}

可以初步判断数据应该来源于函数中的FILE_TO_BUF(MEMINFO_FILE,meminfo_fd),从#define MEMINFO_FILE "/proc/meminfo" 可以判断出数据来源于操作系统的/proc/meminfo,就像procps官网的介绍那样 [procps is the package that has a bunch of small useful utilities that give information about processes using the /proc filesystem]。

下一步就是探究/proc/meminfo里面到底是啥,

从 kernel.org 可以下载到 linux-2.6.32.65 版本源码,解压后有341M,对于一个没有linux源码阅读经验且C语言水平还停留在谭浩强时代的人来说,探索这样庞大的源码,是件看上去挺恐怖的事情。管不了这么多了,先搜索一下再说。

一番折腾后从Documentation\filesystems\proc.txt找到了相关的解释

Buffers: Relatively temporary storage for raw disk blocks
        shouldn't get tremendously large (20MB or so)
Cached: in-memory cache for files read from the disk (the
        pagecache).  Doesn't include SwapCached

下一步,尝试根据“MemTotal”搜索源码,可以找到源码位于fs\proc\Meminfo.c中,这是一个很幸运的开始。linux源码的目录结构还是挺清晰的,为各种查找和猜测提供了极大的便利。

/*
 * Write the meminfo report into the seq_file m and return 0.
 *
 * This excerpt is abridged: the "......" line stands for the remaining
 * format lines and for the seq_printf() arguments, which are not shown.
 */
static int meminfo_proc_show(struct seq_file *m, void *v)
{
    struct sysinfo i;
    unsigned long committed;
    unsigned long allowed;
    struct vmalloc_info vmi;
    long cached;
    unsigned long pages[NR_LRU_LISTS];
    int lru;

/*
 * display in kilobytes.
 */
/* K(x) shifts x left by (PAGE_SHIFT - 10) to turn a page count into kB. */
#define K(x) ((x) << (PAGE_SHIFT - 10))
    /* Fill i with the RAM and swap figures used below (i.bufferram etc.). */
    si_meminfo(&i);
    si_swapinfo(&i);
    committed = percpu_counter_read_positive(&vm_committed_as);
    allowed = ((totalram_pages - hugetlb_total_pages())
        * sysctl_overcommit_ratio / 100) + total_swap_pages;

    /*
     * cached = NR_FILE_PAGES count - swap cache pages - buffer pages.
     * cached is a signed long so a negative result can be detected and
     * clamped to 0.
     */
    cached = global_page_state(NR_FILE_PAGES) -
            total_swapcache_pages - i.bufferram;
    if (cached < 0)
        cached = 0;

    get_vmalloc_info(&vmi);

    /* Snapshot one global counter per LRU list. */
    for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
        pages[lru] = global_page_state(NR_LRU_BASE + lru);

    /*
     * Tagged format, for easy grepping and expansion.
     */
    seq_printf(m,
        "MemTotal:       %8lu kB\n"
        "MemFree:        %8lu kB\n"
        "Buffers:        %8lu kB\n"
        "Cached:         %8lu kB\n"
        "SwapCached:     %8lu kB\n"
        "Active:         %8lu kB\n"
        "Inactive:       %8lu kB\n"
        "Active(anon):   %8lu kB\n"
        "Inactive(anon): %8lu kB\n"
        "Active(file):   %8lu kB\n"
        "Inactive(file): %8lu kB\n"
        "Unevictable:    %8lu kB\n"
        "Mlocked:        %8lu kB\n"
......
        );

    hugetlb_report_meminfo(m);

    arch_report_meminfo(m);

    return 0;
#undef K
}

从源码中可以看出

"Buffers:        %8lu kB\n"
"Cached:         %8lu kB\n"

对应的是

K(i.bufferram),
K(cached),

首先看buffers,i 是 struct sysinfo(定义在 include\linux\kernel.h中),从si_meminfo 和 si_swapinfo 中获取数据。

/*
 * System statistics record. In this walkthrough it is filled by
 * si_meminfo() (totalram, sharedram, freeram, bufferram, totalhigh,
 * freehigh, mem_unit) and si_swapinfo() (freeswap, totalswap).
 * si_meminfo() stores page counts and sets mem_unit to PAGE_SIZE.
 */
struct sysinfo {
    long uptime;            /* Seconds since boot */
    unsigned long loads[3];        /* 1, 5, and 15 minute load averages */
    unsigned long totalram;        /* Total usable main memory size */
    unsigned long freeram;        /* Available memory size */
    unsigned long sharedram;    /* Amount of shared memory */
    unsigned long bufferram;    /* Memory used by buffers */
    unsigned long totalswap;    /* Total swap space size */
    unsigned long freeswap;        /* swap space still available */
    unsigned short procs;        /* Number of current processes */
    unsigned short pad;        /* explicit padding for m68k */
    unsigned long totalhigh;    /* Total high memory size */
    unsigned long freehigh;        /* Available high memory size */
    unsigned int mem_unit;        /* Memory unit size in bytes */
    char _f[20-2*sizeof(long)-sizeof(int)];    /* Padding: libc5 uses this.. */
};

在mm\page_alloc.c中可以找到:

/*
 * Fill the RAM-related fields of *val.
 *
 * bufferram, the value later printed as "Buffers", comes solely from
 * nr_blockdev_pages(). sharedram is always set to 0 here, and mem_unit is
 * set to PAGE_SIZE. The swap fields of *val are not touched.
 */
void si_meminfo(struct sysinfo *val)
{
    val->totalram = totalram_pages;
    val->sharedram = 0;
    val->freeram = global_page_state(NR_FREE_PAGES);
    val->bufferram = nr_blockdev_pages();
    val->totalhigh = totalhigh_pages;
    val->freehigh = nr_free_highpages();
    val->mem_unit = PAGE_SIZE;
}

在mm\Swapfile.c中可以找到:

/*
 * Fill val->freeswap and val->totalswap while holding swap_lock.
 *
 * nr_to_be_unused sums inuse_pages over the swap_info[] entries that have
 * SWP_USED set and SWP_WRITEOK clear; every other entry is skipped. That
 * sum is added to both nr_swap_pages and total_swap_pages.
 * NOTE(review): such entries are presumably swap areas in the middle of
 * swapoff -- confirm against mm/swapfile.c.
 */
void si_swapinfo(struct sysinfo *val)
{
    unsigned int i;
    unsigned long nr_to_be_unused = 0;

    spin_lock(&swap_lock);
    for (i = 0; i < nr_swapfiles; i++) {
        /* Skip entries that are unused or still writable. */
        if (!(swap_info[i].flags & SWP_USED) ||
             (swap_info[i].flags & SWP_WRITEOK))
            continue;
        nr_to_be_unused += swap_info[i].inuse_pages;
    }
    val->freeswap = nr_swap_pages + nr_to_be_unused;
    val->totalswap = total_swap_pages + nr_to_be_unused;
    spin_unlock(&swap_lock);
}

可以看出 Buffers 只跟 nr_blockdev_pages() 有关系,代码在 fs\Block_dev.c中:

/*
 * Return the sum of bd_inode->i_mapping->nrpages over every block_device
 * linked on the all_bdevs list. The list is walked with bdev_lock held.
 */
long nr_blockdev_pages(void)
{
    struct block_device *bdev;
    long ret = 0;
    spin_lock(&bdev_lock);
    list_for_each_entry(bdev, &all_bdevs, bd_list) {
        /* nrpages: page count of this device inode's address_space. */
        ret += bdev->bd_inode->i_mapping->nrpages;
    }
    spin_unlock(&bdev_lock);
    return ret;
}

原来Buffers跟inode有关系。

include\linux\Fs.h有相关结构的定义

/*
 * Block device descriptor. nr_blockdev_pages() uses two members of it:
 * bd_list, the link used to walk the all_bdevs list, and bd_inode, whose
 * i_mapping->nrpages is summed.
 */
struct block_device {
    dev_t            bd_dev;  /* not a kdev_t - it's a search key */
    struct inode *        bd_inode;    /* will die */
    struct super_block *    bd_super;
    int            bd_openers;
    struct mutex        bd_mutex;    /* open/close mutex */
    struct list_head    bd_inodes;
    void *            bd_holder;
    int            bd_holders;
#ifdef CONFIG_SYSFS
    struct list_head    bd_holder_list;
#endif
    struct block_device *    bd_contains;
    unsigned        bd_block_size;
    struct hd_struct *    bd_part;
    /* number of times partitions within this device have been opened. */
    unsigned        bd_part_count;
    int            bd_invalidated;
    struct gendisk *    bd_disk;
    struct list_head    bd_list;
    /*
     * Private data.  You must have bd_claim'ed the block_device
     * to use this.  NOTE:  bd_claim allows an owner to claim
     * the same device multiple times, the owner must take special
     * care to not mess up bd_private for that case.
     */
    unsigned long        bd_private;

    /* The counter of freeze processes */
    int            bd_fsfreeze_count;
    /* Mutex for freeze */
    struct mutex        bd_fsfreeze_mutex;
};

/*
 * Abridged excerpt of struct inode: the "......" line stands for members
 * omitted here. The i_mapping member dereferenced by nr_blockdev_pages()
 * is not among the members shown, so it is part of the omitted portion.
 */
struct inode {
    struct hlist_node    i_hash;
    struct list_head    i_list;        /* backing dev IO list */
    struct list_head    i_sb_list;
    struct list_head    i_dentry;
    unsigned long        i_ino;
    atomic_t        i_count;
    unsigned int        i_nlink;
    uid_t            i_uid;
    gid_t            i_gid;
    dev_t            i_rdev;
    u64            i_version;
    loff_t            i_size;
......
    void            *i_private; /* fs or device private pointer */
};

/*
 * Per-owner page container. Its nrpages member ("number of total pages")
 * is the value nr_blockdev_pages() adds up for each block device inode.
 */
struct address_space {
    struct inode        *host;        /* owner: inode, block_device */
    struct radix_tree_root    page_tree;    /* radix tree of all pages */
    spinlock_t        tree_lock;    /* and lock protecting it */
    unsigned int        i_mmap_writable;/* count VM_SHARED mappings */
    struct prio_tree_root    i_mmap;        /* tree of private and shared mappings */
    struct list_head    i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
    spinlock_t        i_mmap_lock;    /* protect tree, count, list */
    unsigned int        truncate_count;    /* Cover race condition with truncate */
    unsigned long        nrpages;    /* number of total pages */
    pgoff_t            writeback_index;/* writeback starts here */
    const struct address_space_operations *a_ops;    /* methods */
    unsigned long        flags;        /* error bits/gfp mask */
    struct backing_dev_info *backing_dev_info; /* device readahead, etc */
    spinlock_t        private_lock;    /* for use by the address_space */
    struct list_head    private_list;    /* ditto */
    struct address_space    *assoc_mapping;    /* ditto */
    struct mutex        unmap_mutex;    /* to protect unmapping */
} __attribute__((aligned(sizeof(long))));

从 nr_blockdev_pages() 的实现和 nrpages 的注释可以清楚地看出:Buffers 是所有块设备(block_device)对应的 inode 的 address_space 在内存中的页数(nrpages)之和。

接下来看cached,meminfo_proc_show中计算公式如下:

cached = global_page_state(NR_FILE_PAGES) - total_swapcache_pages - i.bufferram;

/*
 * Return the current value of the global counter vm_stat[item], read with
 * atomic_long_read().
 *
 * When CONFIG_SMP is defined, a negative reading is clamped to 0 before it
 * is returned as unsigned long.
 * NOTE(review): presumably the counter can be transiently negative on SMP
 * builds -- confirm against mm/vmstat.c.
 */
static inline unsigned long global_page_state(enum zone_stat_item item)
{
    long x = atomic_long_read(&vm_stat[item]);
#ifdef CONFIG_SMP
    if (x < 0)
        x = 0;
#endif
    return x;
}

更新 vm_stat 的相关函数可以在 include/linux/vmstat.h 中找到:

/*
 * Increment counter `item` by one in two places: the zone's own
 * vm_stat[] array and the global vm_stat[] array.
 */
static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
    atomic_long_inc(&zone->vm_stat[item]);
    atomic_long_inc(&vm_stat[item]);
}

/*
 * Add x to counter `item` in two places: the zone's own vm_stat[] array
 * and the global vm_stat[] array.
 */
static inline void zone_page_state_add(long x, struct zone *zone,
                 enum zone_stat_item item)
{
    atomic_long_add(x, &zone->vm_stat[item]);
    atomic_long_add(x, &vm_stat[item]);
}

/*
 * Decrement counter `item` by one in two places: the zone's own
 * vm_stat[] array and the global vm_stat[] array.
 */
static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
    atomic_long_dec(&zone->vm_stat[item]);
    atomic_long_dec(&vm_stat[item]);
}

也就是说,每次更新 zone->vm_stat[item] 时,全局的 vm_stat[item] 也会被同步更新。

今天跟踪到这里,得空继续探索。

文章目录