Note: this analysis is based on kernel linux-4.18.0-193.14.2.el8_2, i.e. CentOS 8.2.
When a process reads or writes a block device directly — the superblock or inodes, for example — the block data has to be brought into memory. The page cache covered in the previous article caches file data, so it does not apply here; block buffers are used instead. Each block buffer has a buffer head descriptor of type buffer_head. The data itself still lives in ordinary pages; it is merely managed through buffer_heads, and a page used this way is called a buffer page.
struct buffer_head {
	unsigned long b_state;		/* buffer state bitmap */
	struct buffer_head *b_this_page;/* next buffer in the same page */
	struct page *b_page;		/* the page this buffer lives in */
	sector_t b_blocknr;		/* logical block number, relative to the start of the block device */
	size_t b_size;			/* size of mapping */
	char *b_data;			/* pointer to the data within the page */
	struct block_device *b_bdev;	/* the block device this buffer maps */
	bh_end_io_t *b_end_io;		/* I/O completion */
	void *b_private;		/* reserved for b_end_io */
	struct list_head b_assoc_buffers; /* associated with another mapping */
	struct address_space *b_assoc_map; /* mapping this buffer is associated with */
	atomic_t b_count;		/* users using this buffer_head */
};
As you can see, a buffer_head describes the mapping between a disk block and a buffer in memory.
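To make the mapping concrete: for a mapped buffer, the starting sector on disk follows directly from b_blocknr and b_size — this is the same arithmetic submit_bh uses when it fills in a bio. A minimal sketch, assuming 512-byte sectors:

/* Starting 512-byte sector of a mapped buffer_head (sketch). */
static sector_t bh_start_sector(const struct buffer_head *bh)
{
	return bh->b_blocknr * (bh->b_size >> 9);	/* blocks -> sectors */
}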
Buffer pages are mainly used in the following two scenarios: reading or writing a block device directly, as happens with filesystem metadata such as superblocks and inode bitmaps; and reading or writing file data one block at a time, when a page cannot be mapped to disk as a single contiguous run of blocks.
Let's look at the first scenario and use ext4's mkdir to walk through the flow:
ext4_mkdir ->
  ext4_new_inode_start_handle ->
    __ext4_new_inode ->                      // allocate an inode for the new directory
      new_inode ->
      ext4_read_inode_bitmap ->              // fetch the inode bitmap from disk
        sb_getblk ->                         // read the block device data
          __getblk_gfp ->
            __find_get_block ->
              lookup_bh_lru                  // look the BH up in the per-CPU bh_lrus cache
              __find_get_block_slow ->       // bh_lrus miss: search the corresponding page cache instead
                find_get_page_flags ->
                  pagecache_get_page ->      // page cache lookup
                    find_get_entry           // search the radix tree of the bdev->bd_inode->i_mapping address space
                page_buffers                 // page found: check for attached buffer_heads, return NULL if none
              bh_lru_install                 // found: install the bh into the per-CPU bh_lrus to speed up later lookups
            __getblk_slow ->                 // neither bh_lrus nor the page cache has the BH: read from the device
              grow_buffers ->
                grow_dev_page ->
                  find_or_create_page ->
                    pagecache_get_page ->    // search the radix tree of bdev->bd_inode->i_mapping
                      find_get_entry
                      __page_cache_alloc     // no cached page: allocate a new one
                      add_to_page_cache_lru  // and insert it into the radix tree and the LRU list
                  alloc_page_buffers ->      // the found or new page has no buffer_heads: create them
                    alloc_buffer_head ->
                      kmem_cache_zalloc(bh_cachep  // allocate a free buffer_head object from the slab
                    set_bh_page              // point the buffer at its data inside the page
                  link_dev_buffers ->        // link all buffer_heads of the page into a circular list
                    attach_page_buffers      // attach the buffers to the page: page->private points at the buffer_head
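As an aside, sb_getblk only locates or creates the buffer; callers that also need the contents read in use sb_bread, which adds the read I/O and the wait on top of the same lookup path. A minimal sketch of how filesystem code consumes it — read_meta_block and blk are hypothetical names, not kernel symbols:

#include <linux/buffer_head.h>

/* Read one on-disk block through the buffer cache and copy it out. */
static int read_meta_block(struct super_block *sb, sector_t blk, void *dst)
{
	struct buffer_head *bh;

	bh = sb_bread(sb, blk);		/* sb_getblk() + read I/O + wait */
	if (!bh)
		return -EIO;
	memcpy(dst, bh->b_data, sb->s_blocksize);
	brelse(bh);			/* drop the reference sb_bread took */
	return 0;
}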
buffer_heads are allocated in alloc_page_buffers, provided the page in the page cache has no buffer_heads attached yet:
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
		bool retry)
{
	...
	head = NULL;
	offset = PAGE_SIZE;
	/*
	 * Carve the whole page into buffers; the buffer size is the device
	 * blocksize.  With a 4K blocksize a single buffer_head is allocated,
	 * with a 1K blocksize four buffer_heads are allocated.
	 */
	while ((offset -= size) >= 0) {
		/* take a free buffer_head structure from the slab cache */
		bh = alloc_buffer_head(gfp);
		if (!bh)
			goto no_grow;

		/* when several buffer_heads are allocated, chain them via b_this_page */
		bh->b_this_page = head;
		bh->b_blocknr = -1;
		head = bh;

		bh->b_size = size;

		/* point the buffer at its data inside the page */
		set_bh_page(bh, page, offset);
	}
	...
	return head;	/* the list head, i.e. the buffer_head allocated last */
	...
}

void set_bh_page(struct buffer_head *bh,
		struct page *page, unsigned long offset)
{
	bh->b_page = page;	/* b_page points at the owning page */
	BUG_ON(offset >= PAGE_SIZE);
	if (PageHighMem(page))
		bh->b_data = (char *)(0 + offset);	/* highmem: store only the offset */
	else
		bh->b_data = page_address(page) + offset;	/* data address inside the page */
}
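Tracing the loop for a 4K page: with a 1K blocksize, offset takes the values 3072, 2048, 1024 and 0, so four buffer_heads are created and the head that is returned is the one covering offset 0. A userspace sketch of just that arithmetic:

/* Sketch: the offsets alloc_page_buffers assigns (not kernel code). */
#include <stdio.h>

int main(void)
{
	long offset = 4096;	/* PAGE_SIZE */
	long size = 1024;	/* device blocksize */

	while ((offset -= size) >= 0)	/* same loop condition as above */
		printf("buffer_head at page offset %ld\n", offset);
	return 0;
}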
After alloc_page_buffers returns, the caller still runs link_dev_buffers for further processing:
static inline void link_dev_buffers(struct page *page,
		struct buffer_head *head)
{
	struct buffer_head *bh, *tail;

	/* link all buffer_heads of the page into a circular list */
	bh = head;
	do {
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;

	/* attach the buffers to their page */
	attach_page_buffers(page, head);
}

static inline void attach_page_buffers(struct page *page,
		struct buffer_head *head)
{
	get_page(page);
	/* set PG_private: the page carries filesystem data, i.e. buffers */
	SetPagePrivate(page);
	/* make page->private point at the buffer_head list */
	set_page_private(page, (unsigned long)head);
}
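The inverse of attach_page_buffers is page_buffers(page), which reads page->private back as the head of the ring; because the list is circular, a walk stops when it comes back around to the head. A minimal sketch, assuming a lowmem page so that b_data is a real kernel address:

/* Sketch: visit every buffer_head attached to a buffer page. */
static void for_each_page_buffer(struct page *page)
{
	struct buffer_head *head = page_buffers(page);	/* page->private */
	struct buffer_head *bh = head;

	do {
		pr_info("block %llu at page offset %ld\n",
			(unsigned long long)bh->b_blocknr,
			(long)(bh->b_data - (char *)page_address(page)));
		bh = bh->b_this_page;
	} while (bh != head);		/* the list is circular */
}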
In the previous article — the page cache and address_space — we saw that on a page cache miss, readpage is called to fetch the data from disk. For ext4 that is ext4_readpage, which is a thin wrapper around ext4_mpage_readpages:
int ext4_mpage_readpages(struct address_space *mapping,
			 struct list_head *pages, struct page *page,
			 unsigned nr_pages, bool is_readahead)
{
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;

	struct inode *inode = mapping->host;
	/* log2 of the block size; 9 for 512-byte blocks */
	const unsigned blkbits = inode->i_blkbits;
	/* with 512-byte blocks, 8 blocks per 4K page */
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	/* block size in bytes, 512 in this example */
	const unsigned blocksize = 1 << blkbits;
	...
	for (; nr_pages; nr_pages--) {
		int fully_mapped = 1;
		unsigned first_hole = blocks_per_page;

		prefetchw(&page->flags);

		/* if the page already has buffer_heads, keep reading block by block */
		if (page_has_buffers(page))
			goto confused;

		/* first file-relative block covered by this page */
		block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
		/* last block this read needs */
		last_block = block_in_file + nr_pages * blocks_per_page;
		/* last block of the file */
		last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
		/* never read past the last block of the file */
		if (last_block > last_block_in_file)
			last_block = last_block_in_file;
		page_block = 0;
		...
		/* ask ext4_map_blocks for every disk block this page needs */
		while (page_block < blocks_per_page) {
			if (block_in_file < last_block) {
				map.m_lblk = block_in_file;
				map.m_len = last_block - block_in_file;	/* length to read */

				/* look the blocks up on disk */
				if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
					...
				}
			}
			...
			/*
			 * If this mapping is not adjacent to the previous one, fall
			 * back to reading one block at a time.  On a 4K disk, where
			 * the block size equals the page size, adjacency never
			 * matters: one page maps to exactly one block.
			 */
			if (page_block && blocks[page_block-1] != map.m_pblk-1)
				goto confused;

			for (relative_block = 0; ; relative_block++) {
				if (relative_block == map.m_len) {
					/* needed? */
					map.m_flags &= ~EXT4_MAP_MAPPED;
					break;
				} else if (page_block == blocks_per_page)
					break;
				blocks[page_block] = map.m_pblk+relative_block;
				page_block++;
				block_in_file++;
			}
		}
		...
confused:
		...
		if (!PageUptodate(page))
			/* read the file through buffer_heads, one block at a time */
			block_read_full_page(page, ext4_get_block);
		...
	}
	...
	return 0;
}
block_read_full_page then calls create_empty_buffers to create the buffer_heads, exactly as in the direct block device read above, and finally reads the file data from disk.
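block_read_full_page itself knows nothing about the disk layout; the get_block callback passed to it — ext4_get_block here — resolves a file-relative block number to a physical one, filling in b_bdev and b_blocknr and setting BH_Mapped via map_bh. A minimal sketch of such a callback for a hypothetical filesystem that stores file blocks contiguously from a fixed start block (DEMO_DATA_START is an assumed constant, not an ext4 symbol):

/* Sketch: a get_block callback for a hypothetical contiguous layout. */
static int demo_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create)
{
	sector_t phys = DEMO_DATA_START + iblock;	/* assumed layout */

	/* map_bh() fills in b_bdev, b_blocknr, b_size and sets BH_Mapped */
	map_bh(bh_result, inode->i_sb, phys);
	return 0;
}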
On ext4, writes always go through buffer_heads:
SYSCALL_DEFINE3(write                            // entry point of the write syscall
  ksys_write
    vfs_write
      __vfs_write
        new_sync_write
          call_write_iter
            file->f_op->write_iter
              ext4_file_write_iter
                __generic_file_write_iter
                  generic_perform_write
                    a_ops->write_begin
                      ext4_write_begin
                        grab_cache_page_write_begin ->
                          pagecache_get_page ->
                            find_get_entry           // look the page up in the page cache
                            __page_cache_alloc       // miss: allocate a new page object
                            add_to_page_cache_lru    // insert it into the radix tree and the active LRU list
                        __block_write_begin
                          __block_write_begin_int
                            create_page_buffers
                              create_empty_buffers   // the page has no buffers yet: create them
                                alloc_page_buffers
                                  alloc_buffer_head
                                    kmem_cache_zalloc(bh_cachep  // take a free buffer_head from the slab cache
                                  set_bh_page        // point the buffer_head at its data in the page
                                attach_page_buffers
                    iov_iter_copy_from_user_atomic   // copy the data from user space into the page cache
                    a_ops->write_end
                      ext4_write_end
                        block_write_end
                          __block_commit_write
                            mark_buffer_dirty
                        ext4_update_inode_size       // update the file's inode size
                        ext4_mark_inode_dirty        // mark the inode dirty so the change reaches disk
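The mark_buffer_dirty at the tail of this chain is also what filesystem code uses when it modifies metadata through a buffer_head directly: writeback later picks the dirty buffer up, or the caller can flush it synchronously. A minimal sketch of that modify-and-flush pattern — rewrite_block and blk are hypothetical names:

/* Sketch: modify one block through the buffer cache and write it back. */
static int rewrite_block(struct super_block *sb, sector_t blk,
			 const void *src, size_t len)
{
	struct buffer_head *bh = sb_bread(sb, blk);

	if (!bh)
		return -EIO;
	memcpy(bh->b_data, src, len);
	mark_buffer_dirty(bh);		/* flag the bh (and its page) dirty */
	sync_dirty_buffer(bh);		/* optional: write it out right away */
	brelse(bh);
	return 0;
}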
Because buffer_heads are tied to their page, they are reclaimed together with the page. As mentioned in the previous article — the page cache and address_space — drop_caches ends up calling ext4_releasepage on ext4 to release the resources:
ext4_releasepage ->
  jbd2_journal_try_to_free_buffers ->
    try_to_free_buffers ->
      drop_buffers ->
        __clear_page_buffers        // clear page->private
      free_buffer_head ->
        kmem_cache_free(bh_cachep   // return the buffer_head to the slab cache
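Note that try_to_free_buffers only succeeds if every buffer on the page is idle: a buffer that is still referenced, dirty, or locked for I/O keeps the whole page from being freed. The check, simplified from fs/buffer.c:

static inline int buffer_busy(struct buffer_head *bh)
{
	return atomic_read(&bh->b_count) |
	       (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}

This is why drop_caches cannot release pages whose buffers are dirty or still in use.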
Take a write to the file /home/test.c as an example.