free中的buffers和cached(1)
linux查看系统内存常用命令free中有2个重要的参数buffers和cached:
total used free shared buffers cached
Mem: 16436192 10914412 5521780 0 502256 5795248
-/+ buffers/cache: 4616908 11819284
Swap: 4096532 116 4096416
这2个参数到底是什么意思,百度了各种文章,大多概念混淆、含糊不清,难以得到满意的答案,今天做一个源码探索,发现free原来就是获取/proc/meminfo中对应的参数。
执行 free;cat /proc/meminfo,两边的 buffers、cached完全一致
total used free shared buffers cached
Mem: 16436192 10914716 5521476 0 502256 5795248
-/+ buffers/cache: 4617212 11818980
Swap: 4096532 116 4096416
——————————————————————————————————————————————————————————————————————————
MemTotal: 16436192 kB
MemFree: 5521468 kB
Buffers: 502256 kB
Cached: 5795248 kB
SwapCached: 12 kB
Active: 6215352 kB
Inactive: 4218000 kB
HighTotal: 0 kB
.....
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
Hugepagesize: 2048 kB
不仅仅如此,top、vmstat、uptime等等一堆重要的命令中的数据都来自于/proc下的文件。
下面就聊聊源码的探索过程。
从网络资料可以探究到free是procps软件包的一部分,从网上下载procps源码(3.2.3),很容易看出free.c就是free的入口,main函数中meminfo()是关键
/*
 * Output loop of free (excerpt from main() in free.c).
 * Each pass refreshes the kb_* counters with meminfo() and prints the table.
 * The loop repeats only while pause_length is non-zero and --count is non-zero.
 * NOTE(review): S() is defined outside this excerpt; presumably it scales a
 * kB value to the unit selected by the user -- confirm in free.c.
 */
do {
meminfo();
printf(" total used free shared buffers cached\n");
// "Mem:" row: the values come straight from the counters filled by meminfo().
printf(
"%-7s %10Lu %10Lu %10Lu %10Lu %10Lu %10Lu\n", "Mem:",
S(kb_main_total),
S(kb_main_used),
S(kb_main_free),
S(kb_main_shared),
S(kb_main_buffers),
S(kb_main_cached)
);
// Print low vs. high information, if the user requested it.
// Note we check if low_total==0: if so, then this kernel does
// not export the low and high stats. Note we still want to
// print the high info, even if it is zero.
if (show_high) {
printf(
"%-7s %10Lu %10Lu %10Lu\n", "Low:",
S(kb_low_total),
S(kb_low_total - kb_low_free),
S(kb_low_free)
);
printf(
"%-7s %10Lu %10Lu %10Lu\n", "High:",
S(kb_high_total),
S(kb_high_total - kb_high_free),
S(kb_high_free)
);
}
// "-/+ buffers/cache" row: used with (buffers + cached) subtracted,
// and free with (buffers + cached) added. Skipped when old_fmt is set.
if(!old_fmt){
unsigned KLONG buffers_plus_cached = kb_main_buffers + kb_main_cached;
printf(
"-/+ buffers/cache: %10Lu %10Lu\n",
S(kb_main_used - buffers_plus_cached),
S(kb_main_free + buffers_plus_cached)
);
}
printf(
"%-7s %10Lu %10Lu %10Lu\n", "Swap:",
S(kb_swap_total),
S(kb_swap_used),
S(kb_swap_free)
);
// Optional "Total:" row: main memory and swap added together.
if(show_total){
printf(
"%-7s %10Lu %10Lu %10Lu\n", "Total:",
S(kb_main_total + kb_swap_total),
S(kb_main_used + kb_swap_used),
S(kb_main_free + kb_swap_free)
);
}
// In repeat mode, emit a blank separator line and flush stdout.
// The sleep is skipped when count is 1, i.e. on the final pass.
if(pause_length){
fputc('\n', stdout);
fflush(stdout);
if (count != 1) usleep(pause_length);
}
} while(pause_length && --count);
meminfo() 的代码在 proc/sysinfo.c 中:
/*
 * Parse the "Name: value" lines of MEMINFO_FILE into the kb_* globals.
 * Each row name is looked up in mem_table and the number that follows the
 * colon is stored through the matching slot pointer. Rows whose name is not
 * in the table are ignored. A few derived values are computed at the end.
 * NOTE(review): FILE_TO_BUF and buf are defined outside this excerpt;
 * presumably the macro reads the whole file into buf -- confirm in sysinfo.c.
 */
void meminfo(void){
char namebuf[16]; /* big enough to hold any row name */
/* bsearch key: its name field points at namebuf, which is refilled per row */
mem_table_struct findme = { namebuf, NULL};
mem_table_struct *found;
char *head;
char *tail;
/*
 * Lookup table mapping a row name to the variable that receives its value.
 * It is searched with bsearch(), so the entries must stay in the order
 * compare_mem_table_structs expects (comparator not shown in this excerpt).
 */
static const mem_table_struct mem_table[] = {
{"Active", &kb_active}, // important
{"Buffers", &kb_main_buffers}, // important
{"Cached", &kb_main_cached}, // important
{"Committed_AS", &kb_committed_as},
{"Dirty", &kb_dirty}, // kB version of vmstat nr_dirty
{"HighFree", &kb_high_free},
{"HighTotal", &kb_high_total},
{"Inact_clean", &kb_inact_clean},
{"Inact_dirty", &kb_inact_dirty},
{"Inact_laundry",&kb_inact_laundry},
{"Inact_target", &kb_inact_target},
{"Inactive", &kb_inactive}, // important
{"LowFree", &kb_low_free},
{"LowTotal", &kb_low_total},
{"Mapped", &kb_mapped}, // kB version of vmstat nr_mapped
{"MemFree", &kb_main_free}, // important
{"MemShared", &kb_main_shared}, // important
{"MemTotal", &kb_main_total}, // important
{"PageTables", &kb_pagetables}, // kB version of vmstat nr_page_table_pages
{"ReverseMaps", &nr_reversemaps}, // same as vmstat nr_page_table_pages
{"Slab", &kb_slab}, // kB version of vmstat nr_slab
{"SwapCached", &kb_swap_cached},
{"SwapFree", &kb_swap_free}, // important
{"SwapTotal", &kb_swap_total}, // important
{"Writeback", &kb_writeback}, // kB version of vmstat nr_writeback
};
const int mem_table_count = sizeof(mem_table)/sizeof(mem_table_struct);
FILE_TO_BUF(MEMINFO_FILE,meminfo_fd);
/* Sentinel: still ~0UL after parsing means no "Inactive" row was found */
kb_inactive = ~0UL;
head = buf;
for(;;){
/* Isolate the row name: everything from head up to the next ':' */
tail = strchr(head, ':');
if(!tail) break;
*tail = '\0';
/* A name too long for namebuf cannot be in the table; skip the row */
if(strlen(head) >= sizeof(namebuf)){
head = tail+1;
goto nextline;
}
strcpy(namebuf,head);
found = bsearch(&findme, mem_table, mem_table_count,
sizeof(mem_table_struct), compare_mem_table_structs
);
head = tail+1;
if(!found) goto nextline;
/* Parse the decimal number after the colon into the matching variable */
*(found->slot) = strtoul(head,&tail,10);
nextline:
/* Advance head to the start of the next line, or stop at end of buffer */
tail = strchr(head, '\n');
if(!tail) break;
head = tail+1;
}
if(!kb_low_total){ /* low==main except with large-memory support */
kb_low_total = kb_main_total;
kb_low_free = kb_main_free;
}
/* No "Inactive" row: build the value from the three Inact_* rows instead */
if(kb_inactive==~0UL){
kb_inactive = kb_inact_dirty + kb_inact_clean + kb_inact_laundry;
}
/* "used" is not read from the file; it is derived as total minus free */
kb_swap_used = kb_swap_total - kb_swap_free;
kb_main_used = kb_main_total - kb_main_free;
}
可以初步判断数据应该来源于函数中的 FILE_TO_BUF(MEMINFO_FILE,meminfo_fd),从 #define MEMINFO_FILE "/proc/meminfo" 可以判断出数据来源于操作系统的 /proc/meminfo,就像 procps 官网的介绍那样 [procps is the package that has a bunch of small useful utilities that give information about processes using the /proc filesystem]。
下一步就是探究/proc/meminfo里面到底是啥,
从 kernel.org 可以下载到 linux-2.6.32.65 版本源码,解压后有341M,对于一个没有linux源码阅读经验且C语言水平还停留在谭浩强时代的人来说,探索这样庞大的源码,是件看上去挺恐怖的事情。管不了这么多了,先搜索一下再说。
一番折腾后从Documentation\filesystems\proc.txt找到了相关的解释
Buffers: Relatively temporary storage for raw disk blocks
shouldn't get tremendously large (20MB or so)
Cached: in-memory cache for files read from the disk (the
pagecache). Doesn't include SwapCached
下一步,尝试根据“MemTotal”搜索源码,可以找到源码位于fs\proc\meminfo.c中,这是一个很幸运的开始。linux源码的目录结构还是挺清晰的,为各种查找和猜测提供了极大的便利。
/*
 * Write the contents of /proc/meminfo into the seq_file m.
 * It gathers memory and swap figures into a struct sysinfo, derives the
 * "Cached" value, and prints everything as "Name: value kB" lines.
 * This excerpt is abridged: the "......" line stands for the remaining
 * format lines and the seq_printf() arguments, which are not shown.
 */
static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
unsigned long committed;
unsigned long allowed;
struct vmalloc_info vmi;
long cached;
unsigned long pages[NR_LRU_LISTS];
int lru;
/*
 * display in kilobytes.
 * K(x) shifts a page count left by (PAGE_SHIFT - 10).
 */
#define K(x) ((x) << (PAGE_SHIFT - 10))
si_meminfo(&i);
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
allowed = ((totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100) + total_swap_pages;
/*
 * Cached = file pages minus swap-cache pages minus buffer pages.
 * cached is a signed long so a negative result can be clamped to 0.
 */
cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages - i.bufferram;
if (cached < 0)
cached = 0;
get_vmalloc_info(&vmi);
/* Snapshot the global counter of every LRU list */
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);
/*
 * Tagged format, for easy grepping and expansion.
 */
seq_printf(m,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
"Active: %8lu kB\n"
"Inactive: %8lu kB\n"
"Active(anon): %8lu kB\n"
"Inactive(anon): %8lu kB\n"
"Active(file): %8lu kB\n"
"Inactive(file): %8lu kB\n"
"Unevictable: %8lu kB\n"
"Mlocked: %8lu kB\n"
......
);
hugetlb_report_meminfo(m);
arch_report_meminfo(m);
return 0;
#undef K
}
从源码中可以看出
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
对应的是
K(i.bufferram),
K(cached),
首先看buffers,i 是 struct sysinfo(定义在 include\linux\kernel.h中),从si_meminfo 和 si_swapinfo 中获取数据。
/*
 * System statistics record.
 * si_meminfo() fills totalram, sharedram, freeram, bufferram, totalhigh,
 * freehigh and mem_unit; si_swapinfo() fills totalswap and freeswap.
 * si_meminfo() stores page counts in the memory fields and sets mem_unit
 * to PAGE_SIZE.
 */
struct sysinfo {
long uptime; /* Seconds since boot */
unsigned long loads[3]; /* 1, 5, and 15 minute load averages */
unsigned long totalram; /* Total usable main memory size */
unsigned long freeram; /* Available memory size */
unsigned long sharedram; /* Amount of shared memory */
unsigned long bufferram; /* Memory used by buffers */
unsigned long totalswap; /* Total swap space size */
unsigned long freeswap; /* swap space still available */
unsigned short procs; /* Number of current processes */
unsigned short pad; /* explicit padding for m68k */
unsigned long totalhigh; /* Total high memory size */
unsigned long freehigh; /* Available high memory size */
unsigned int mem_unit; /* Memory unit size in bytes */
char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */
};
在mm\page_alloc.c中可以找到:
/*
 * Fill the main-memory fields of *val.
 * sharedram is always set to 0 here. bufferram is taken from
 * nr_blockdev_pages(). mem_unit is set to PAGE_SIZE.
 */
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
val->sharedram = 0;
val->freeram = global_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
val->mem_unit = PAGE_SIZE;
}
在mm\swapfile.c中可以找到:
/*
 * Fill the swap fields (freeswap, totalswap) of *val under swap_lock.
 * nr_to_be_unused sums inuse_pages of every swap_info entry that has
 * SWP_USED set but SWP_WRITEOK clear; that sum is added to both totals.
 * NOTE(review): presumably these are areas in the middle of swapoff --
 * confirm in mm/swapfile.c.
 */
void si_swapinfo(struct sysinfo *val)
{
unsigned int i;
unsigned long nr_to_be_unused = 0;
spin_lock(&swap_lock);
for (i = 0; i < nr_swapfiles; i++) {
/* Skip entries that are unused, and entries that are still writable */
if (!(swap_info[i].flags & SWP_USED) ||
(swap_info[i].flags & SWP_WRITEOK))
continue;
nr_to_be_unused += swap_info[i].inuse_pages;
}
val->freeswap = nr_swap_pages + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
spin_unlock(&swap_lock);
}
可以看出 Buffers 只跟 nr_blockdev_pages() 有关系,代码在 fs\block_dev.c中:
/*
 * Return the sum of bd_inode->i_mapping->nrpages over every block_device
 * on the all_bdevs list. The list is walked while holding bdev_lock.
 */
long nr_blockdev_pages(void)
{
struct block_device *bdev;
long ret = 0;
spin_lock(&bdev_lock);
list_for_each_entry(bdev, &all_bdevs, bd_list) {
ret += bdev->bd_inode->i_mapping->nrpages;
}
spin_unlock(&bdev_lock);
return ret;
}
原来Buffers跟inode有关系。
include\linux\fs.h有相关结构的定义
/*
 * Descriptor of a block device.
 * Two fields matter for the Buffers figure: bd_list is the link that
 * nr_blockdev_pages() iterates over in all_bdevs, and bd_inode is the inode
 * whose i_mapping->nrpages it adds up.
 */
struct block_device {
dev_t bd_dev; /* not a kdev_t - it's a search key */
struct inode * bd_inode; /* will die */
struct super_block * bd_super;
int bd_openers;
struct mutex bd_mutex; /* open/close mutex */
struct list_head bd_inodes;
void * bd_holder;
int bd_holders;
#ifdef CONFIG_SYSFS
struct list_head bd_holder_list;
#endif
struct block_device * bd_contains;
unsigned bd_block_size;
struct hd_struct * bd_part;
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
int bd_invalidated;
struct gendisk * bd_disk;
struct list_head bd_list;
/*
 * Private data. You must have bd_claim'ed the block_device
 * to use this. NOTE: bd_claim allows an owner to claim
 * the same device multiple times, the owner must take special
 * care to not mess up bd_private for that case.
 */
unsigned long bd_private;
/* The counter of freeze processes */
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
};
/*
 * Abridged excerpt of struct inode: the "......" line stands for fields
 * that are not shown. The i_mapping member that nr_blockdev_pages()
 * dereferences is among the omitted fields.
 */
struct inode {
struct hlist_node i_hash;
struct list_head i_list; /* backing dev IO list */
struct list_head i_sb_list;
struct list_head i_dentry;
unsigned long i_ino;
atomic_t i_count;
unsigned int i_nlink;
uid_t i_uid;
gid_t i_gid;
dev_t i_rdev;
u64 i_version;
loff_t i_size;
......
void *i_private; /* fs or device private pointer */
};
/*
 * Per-owner collection of cached pages (the owner is an inode or a
 * block_device, see host). nrpages is the field that nr_blockdev_pages()
 * sums to obtain the Buffers figure.
 */
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and lock protecting it */
unsigned int i_mmap_writable;/* count VM_SHARED mappings */
struct prio_tree_root i_mmap; /* tree of private and shared mappings */
struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
spinlock_t i_mmap_lock; /* protect tree, count, list */
unsigned int truncate_count; /* Cover race condition with truncate */
unsigned long nrpages; /* number of total pages */
pgoff_t writeback_index;/* writeback starts here */
const struct address_space_operations *a_ops; /* methods */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
struct mutex unmap_mutex; /* to protect unmapping */
} __attribute__((aligned(sizeof(long))));
结合 nr_blockdev_pages() 的代码和 nrpages 的注释可以看出,Buffers 是所有块设备(block_device)的 bd_inode 所对应的 address_space(i_mapping)在内存中的总页数。
接下来看cached,meminfo_proc_show中计算公式如下:
cached = global_page_state(NR_FILE_PAGES) - total_swapcache_pages - i.bufferram;
/*
 * Read the global counter vm_stat[item] and return it as unsigned long.
 * With CONFIG_SMP a negative reading is clamped to 0.
 * NOTE(review): presumably a transient negative value can appear because
 * per-CPU deltas are folded in lazily -- confirm in mm/vmstat.c.
 */
static inline unsigned long global_page_state(enum zone_stat_item item)
{
long x = atomic_long_read(&vm_stat[item]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
#endif
return x;
}
更新 vm_stat 的函数可以在 include/linux/vmstat.h 中找到:
/*
 * Increment counter `item` by one in both the zone's own vm_stat array
 * and the global vm_stat array.
 */
static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
atomic_long_inc(&zone->vm_stat[item]);
atomic_long_inc(&vm_stat[item]);
}
/*
 * Add the signed delta x to counter `item` in both the zone's own vm_stat
 * array and the global vm_stat array.
 */
static inline void zone_page_state_add(long x, struct zone *zone,
enum zone_stat_item item)
{
atomic_long_add(x, &zone->vm_stat[item]);
atomic_long_add(x, &vm_stat[item]);
}
/*
 * Decrement counter `item` by one in both the zone's own vm_stat array
 * and the global vm_stat array.
 */
static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
atomic_long_dec(&zone->vm_stat[item]);
atomic_long_dec(&vm_stat[item]);
}
也就是说,每次修改某个 zone 的 zone->vm_stat[item] 时,全局的 vm_stat[item] 也会同步做同样的修改。
今天跟踪到这里,得空继续探索。