赞
踩
我们都知道Linux为了加速读写速度,采用了pagecache机制,用内存缓存磁盘内容,而buffer_head正是连接page和磁盘块的关键结构.
1. buffer_head是磁盘块的一个抽象,一个buffer_head对应一个磁盘块,buffer_head中保存对应的磁盘块号(b_blocknr)
2. buffer_head把page与磁盘块联系起来,由于page和磁盘块的大小可能不一样,所以一个page可能管理多个buffer_head
这里假设page大小4K,块大小为1K, buffer_head,page和磁盘块关系如下:
这里以写文件为例说明page cache,buffer_head和磁盘块的映射
采用异步IO方式写文件时,会调用到generic_perform_write函数
- static ssize_t generic_perform_write(struct file *file,
- struct iov_iter *i, loff_t pos)
- {
- /* Excerpt: local declarations and bookkeeping of the full function are omitted. */
- do {
- /* Set up the page / buffer_head / disk-block mapping. */
- status = a_ops->write_begin(file, mapping, pos, bytes, flags,
- &page, &fsdata);
- if (unlikely(status))
- break;
- /* Copy the user data into the page. */
- copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
- /* Mark the buffers dirty; the data is written to disk asynchronously later. */
- status = a_ops->write_end(file, mapping, pos, bytes, copied,
- page, fsdata);
- } while (iov_iter_count(i));
-
- /* Report the bytes written if any; otherwise the last status. */
- return written ? written : status;
- }
-
write_begin和write_end会调用到具体文件系统的实现,这里以ext4为例,ext4_write_begin:
- static int ext4_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
- { /* Excerpt: obtains the page-cache page, maps its buffers, and hands the page back through *pagep. */
- /* Get the page-cache page for this index. */
- page = grab_cache_page_write_begin(mapping, index, flags);
- /* Tie the page to its buffer_heads and disk blocks; per the article, ext4_get_block allocates the actual disk space. */
- ret = __block_write_begin(page, pos, len, ext4_get_block);
-
- *pagep = page; /* return the page to the caller */
- return ret;
- }
每个inode都有一个address_space结构,不仅提供了文件系统层操作,还用一棵radix tree来管理inode所有page cache.
grab_cache_page_write_begin:首先会用index在mapping的radix tree中查找对应的page cache,找不到,创建新的页面.
而index表示page在文件中的偏移,单位是page_size。这里重点看__block_write_begin和ext4_get_block函数:
__block_write_begin:
1.给page分配buffer_head,由create_page_buffers完成
2.buffer_head与磁盘块的映射,由ext4_get_block完成
3.这里可能涉及到读磁盘,因为向page写入数据时,先要保证page已有的buffer数据与磁盘一致,否则会出现数据覆盖
- int __block_write_begin(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block)
- { /* Excerpt: prepares the page's buffers for a write; returns err (set to -EIO when a read fails). Declarations are omitted. */
- /* Create the buffer_heads for the page. */
- head = create_page_buffers(page, inode, 0);
- blocksize = head->b_size;
- bbits = block_size_bits(blocksize);
- /* Convert the page index into a block number within the file (not a disk block number). */
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
-
- for(bh = head, block_start = 0; bh != head || !block_start;
- block++, block_start=block_end, bh = bh->b_this_page) {
- block_end = block_start + blocksize;
- /* This bh lies entirely outside the range being written: skip it. */
- if (block_end <= from || block_start >= to) {
- if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- }
- continue;
- }
- if (buffer_new(bh))
- clear_buffer_new(bh);
- /* Not mapped yet: have get_block map this bh (per the article, this allocates disk space). */
- if (!buffer_mapped(bh)) {
- WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
- if (err)
- break;
- }
- /* The page is already up to date: mark the bh up to date too and skip the read. */
- if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- continue;
- }
- /* The write covers only part of a block that is not up to date: read it from disk first. */
- if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
- !buffer_unwritten(bh) &&
- (block_start < from || block_end > to)) {
- ll_rw_block(READ, 1, &bh);/* refresh the page-cache contents; without this, existing data would be overwritten */
- *wait_bh++=bh;
- }
- }
- /* Wait for the reads to complete. */
- while(wait_bh > wait) {
- wait_on_buffer(*--wait_bh);
- if (!buffer_uptodate(*wait_bh))
- err = -EIO;
- }
- return err;
- }
create_page_buffers判断当前page是否已分配buffer_head,若尚未分配,则调用create_empty_buffers创建buffer_head
此时创建完成的buffer_head并没有映射到具体的磁盘块
- void create_empty_buffers(struct page *page,
- unsigned long blocksize, unsigned long b_state)
- { /* Excerpt: allocates buffer_heads for the page, links them into a ring, and attaches them to the page. */
- struct buffer_head *bh, *head, *tail;
- /* Allocate the buffer_heads. */
- head = alloc_page_buffers(page, blocksize, 1);
- bh = head;
- /* OR b_state into every buffer and remember the last one, then close the list into a circular list. */
- do {
- bh->b_state |= b_state;
- tail = bh;
- bh = bh->b_this_page;
- } while (bh);
- tail->b_this_page = head;
-
- /* Associate the buffer_heads with the page. */
- attach_page_buffers(page, head);
- }
ext4_get_block主要分配磁盘空间,并调用map_bh建立buffer_head与磁盘块的映射
- static inline void
- map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
- { /* Marks bh as mapped and records which device and block it refers to. */
- set_buffer_mapped(bh);
- bh->b_bdev = sb->s_bdev; /* block device taken from the superblock */
- bh->b_blocknr = block; /* block number passed in by the caller */
- bh->b_size = sb->s_blocksize; /* buffer size set to the superblock's block size */
- }
到这里,page,buffer_head和磁盘块的映射关系建立完成,之后的流程就是等待write数据异步写到磁盘
当buffer_head建立好后,就可以直接发起bio操作,这里以读流程来说明:
bh_submit_read是同步读函数,会等待buffer_head变为unlock状态:
- int bh_submit_read(struct buffer_head *bh)
- { /* Reads bh and waits for completion; returns 0 if the buffer is up to date afterwards, -EIO otherwise. */
- BUG_ON(!buffer_locked(bh)); /* the caller must pass in a locked buffer */
- /* The buffer is already up to date: unlock it and return without issuing a bio. */
- if (buffer_uptodate(bh)) {
- unlock_buffer(bh);
- return 0;
- }
-
- get_bh(bh);
- /* Set the completion callback. */
- bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh);
- /* Wait until the buffer is unlocked. */
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return 0;
- return -EIO;
- }
submit_bh直接调用submit_bh_wbc函数发起bio
可以看到,到了bio层,就没有buffer head这个概念了,直接用page和bi_sector来操作对应的块
- /* Build a single-segment bio describing bh and submit it. */
- static int submit_bh_wbc(int rw, struct buffer_head *bh,
- unsigned long bio_flags, struct writeback_control *wbc)
- {
- struct bio *bio;
- int ret = 0; /* never modified in this excerpt, so 0 is always returned */
- /* Allocate the bio. */
- bio = bio_alloc(GFP_NOIO, 1);
-
- if (wbc) {
- wbc_init_bio(wbc, bio);
- wbc_account_io(wbc, bh->b_page, bh->b_size);
- }
- /* Convert the logical block number into a 512-byte sector number. */
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio_set_dev(bio, bh->b_bdev);
- bio->bi_io_vec[0].bv_page = bh->b_page;
- bio->bi_io_vec[0].bv_len = bh->b_size;
- /* Offset of this bh within its page. */
- bio->bi_io_vec[0].bv_offset = bh_offset(bh);
-
- bio->bi_vcnt = 1;
- bio->bi_size = bh->b_size;
-
- /* Completion handler, with the bh stored as its private data. */
- bio->bi_end_io = end_bio_bh_io_sync;
- bio->bi_private = bh;
- bio->bi_flags |= bio_flags;
- /* Sanity-check the request (NOTE(review): guard_bh_eod is presumably an end-of-device bounds check; confirm). */
- guard_bh_eod(rw, bio, bh);
-
- if (buffer_meta(bh))
- rw |= REQ_META;
- if (buffer_prio(bh))
- rw |= REQ_PRIO;
-
- /* Hold an extra reference on the bio across the submission. */
- bio_get(bio);
- /* Submit the bio. */
- submit_bio(rw, bio);
-
- bio_put(bio);
- return ret;
- }
4.buffer_head的状态
BH_Uptodate:表示BH的数据是最新的,甚至比磁盘还新(uptodate|dirty)
BH_Dirty: BH数据是脏的,需要回刷到磁盘块
BH_Lock: BH正在进行IO操作
BH_Mapped: BH建立了磁盘映射
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。