赞
踩
xv6文件系统实现分为七层
文件描述符(File descriptor) |
---|
路径名(Pathname) |
目录(Directory) |
索引结点(Inode) |
日志(Logging) |
缓冲区高速缓存(Buffer cache) |
磁盘(Disk) |
文件系统必须有将索引节点和内容块存储在磁盘上哪些位置的方案。为此,xv6将磁盘划分为几个部分,如图8.2所示。
boot:文件系统不使用块0(它保存引导扇区)
super:它包含有关文件系统的元数据(文件系统大小(以块为单位)、数据块数、索引节点数和日志中的块数)。
log:保存日志(用于出现crash后进行数据恢复)
inodes: 是索引节点,每个块有多个索引节点
bit map:位图,跟踪正在使用的数据块,用来保存那些块是被使用或空闲的。
data:剩下的就是数据块,是否被占用保存在bitmap中
buffe catch 的两个任务
对上提供的接口
通过给每一个 buffer 分配一个 sleeplock 的方式实现一个磁盘块的拷贝只能被一个内核线程使用
Buffer cache中保存磁盘块的缓冲区数量固定,如果文件系统请求还未存放在缓存中的块,Buffer cache必须回收当前保存其他块内容的缓冲区。Buffer cache为新块回收最近使用最少的缓冲区。
Buffer cache是以双链表表示的缓冲区(为了实现LRU替换算法)
struct buf {
int valid; // has data been read from disk?
int disk; // does disk "own" buf?
uint dev;
uint blockno;
struct sleeplock lock;
uint refcnt;
struct buf *prev; // LRU cache list
struct buf *next;
uchar data[BSIZE];
};
main(kernel/main.c)调用的函数binit使用静态数组buf(kernel/bio.c)中的NBUF个缓冲区初始化列表。对Buffer cache的所有其他访问都通过bcache.head引用链表,而不是buf数组。
struct {
struct spinlock lock;
struct buf buf[NBUF];
// Linked list of all buffers, through prev/next.
// Sorted by how recently the buffer was used.
// head.next is most recent, head.prev is least.
struct buf head;
} bcache;
binit函数
void binit(void) { struct buf *b; initlock(&bcache.lock, "bcache"); // Create linked list of buffers bcache.head.prev = &bcache.head; bcache.head.next = &bcache.head; for(b = bcache.buf; b < bcache.buf+NBUF; b++){ b->next = bcache.head.next; b->prev = &bcache.head; initsleeplock(&b->lock, "buffer"); bcache.head.next->prev = b; bcache.head.next = b; } }
bget函数
bget扫描缓冲区列表,查找具有给定设备和扇区号的缓冲区。
static struct buf* bget(uint dev, uint blockno) { struct buf *b; acquire(&bcache.lock); // Is the block already cached? for(b = bcache.head.next; b != &bcache.head; b = b->next){ if(b->dev == dev && b->blockno == blockno){ b->refcnt++; release(&bcache.lock); acquiresleep(&b->lock); return b; } } // Not cached. // Recycle the least recently used (LRU) unused buffer. for(b = bcache.head.prev; b != &bcache.head; b = b->prev){ if(b->refcnt == 0) { b->dev = dev; b->blockno = blockno; b->valid = 0; b->refcnt = 1; release(&bcache.lock); acquiresleep(&b->lock); return b; } } panic("bget: no buffers"); }
bread函数
// Return a locked buf with the contents of the indicated block.
struct buf*
bread(uint dev, uint blockno)
{
struct buf *b;
b = bget(dev, blockno);
if(!b->valid) {
virtio_disk_rw(b, 0);
b->valid = 1;
}
return b;
}
bwrite函数
该函数作用,将内存(buffer中)的数据写回到磁盘
// Write b's contents to disk. Must be locked.
void
bwrite(struct buf *b)
{
if(!holdingsleep(&b->lock))
panic("bwrite");
virtio_disk_rw(b, 1);
}
目的
解决了文件系统操作期间的崩溃问题
解决方法
log设计
struct logheader {
int n;
int block[LOGSIZE];
};
下面看一下log结构体
struct log {
struct spinlock lock;
int start;
int size;
int outstanding; // 有多少FS系统调用正在执行调用正在执行
int committing; // 是否在commit当中
int dev;
struct logheader lh;
};
在系统调用中一个典型的日志使用就像这样:
begin_op();
...
bp = bread(...);
bp->data[...] = ...;
log_write(bp);
...
end_op();
下面我们就上面过程,进行逐一分析
begin_op函数
void begin_op(void) { acquire(&log.lock); while(1){ if(log.committing){ sleep(&log, &log.lock); } else if(log.lh.n + (log.outstanding+1)*MAXOPBLOCKS > LOGSIZE){ // this op might exhaust log space; wait for commit. sleep(&log, &log.lock); } else { log.outstanding += 1; release(&log.lock); break; } } }
log_write函数
void log_write(struct buf *b) { int i; acquire(&log.lock); if (log.lh.n >= LOGSIZE || log.lh.n >= log.size - 1) panic("too big a transaction"); if (log.outstanding < 1) panic("log_write outside of trans"); for (i = 0; i < log.lh.n; i++) { if (log.lh.block[i] == b->blockno) // log absorption break; } log.lh.block[i] = b->blockno; if (i == log.lh.n) { // Add new block to log? bpin(b); log.lh.n++; } release(&log.lock); }
end_op函数
// called at the end of each FS system call. // commits if this was the last outstanding operation. void end_op(void) { int do_commit = 0; acquire(&log.lock); log.outstanding -= 1; if(log.committing) panic("log.committing"); if(log.outstanding == 0){ do_commit = 1; log.committing = 1; } else { // begin_op() may be waiting for log space, // and decrementing log.outstanding has decreased // the amount of reserved space. wakeup(&log); } release(&log.lock); if(do_commit){ // call commit w/o holding locks, since not allowed // to sleep with locks. commit(); acquire(&log.lock); log.committing = 0; wakeup(&log); release(&log.lock); } }
commit函数
commit有四个步骤,代码如下
static void
commit()
{
if (log.lh.n > 0) {
write_log(); // Write modified blocks from cache to log
write_head(); // Write header to disk -- the real commit
install_trans(0); // Now install writes to home locations
log.lh.n = 0;
write_head(); // Erase the transaction from the log
}
}
write_log()函数
将事务中修改的每个块从缓冲区缓存复制到磁盘上日志槽位中。
// Copy modified blocks from cache to log.
static void
write_log(void)
{
int tail;
for (tail = 0; tail < log.lh.n; tail++) {
struct buf *to = bread(log.dev, log.start+tail+1); // log block
struct buf *from = bread(log.dev, log.lh.block[tail]); // cache block
memmove(to->data, from->data, BSIZE);
bwrite(to); // write the log
brelse(from);
brelse(to);
}
}
write_head()函数
将头块写入磁盘:这是提交点,写入后的崩溃将导致从日志恢复重演事务的写入操作。
static void
write_head(void)
{
struct buf *buf = bread(log.dev, log.start);
struct logheader *hb = (struct logheader *) (buf->data);
int i;
hb->n = log.lh.n;
for (i = 0; i < log.lh.n; i++) {
hb->block[i] = log.lh.block[i];
}
bwrite(buf);
brelse(buf);
}
write_head():把 log 块的头信息写入磁盘
log 块区域的第一块是 log 的 header 块
install_trans函数
// Copy committed blocks from log to their home location static void install_trans(int recovering) { int tail; for (tail = 0; tail < log.lh.n; tail++) { struct buf *lbuf = bread(log.dev, log.start+tail+1); // read log block struct buf *dbuf = bread(log.dev, log.lh.block[tail]); // read dst memmove(dbuf->data, lbuf->data, BSIZE); // copy block to dst bwrite(dbuf); // write dst to disk if(recovering == 0) bunpin(dbuf); brelse(lbuf); brelse(dbuf); } }
recover_from_log()函数
static void
recover_from_log(void)
{
read_head();
install_trans(1); // if committed, copy from log to disk
log.lh.n = 0;
write_head(); // clear the log
}
用于crash恢复
xv6的块分配器在磁盘上维护一个空闲位图,每一位代表一个块。0表示对应的块是空闲的;1表示它正在使用中。
程序mkfs设置对应于引导扇区、超级块、日志块、inode块和位图块的比特位。
balloc() 函数从磁盘中找一个空闲的磁盘块并返回(bitmap 中标记为 0)
bfree() 函数释放一个指定的磁盘块
两种含义
// On-disk inode structure
struct dinode {
short type; // File type
short major; // Major device number (T_DEVICE only)
short minor; // Minor device number (T_DEVICE only)
short nlink; // Number of links to inode in file system
uint size; // Size of file (bytes)
uint addrs[NDIRECT+1]; // Data block addresses
};
struct inode(kernel/file.h:17)是磁盘上struct dinode的内存副本。只有当有C指针引用某个inode时,内核才会在内存中存储该inode。
结构如下:
// in-memory copy of an inode
struct inode {
uint dev; // Device number
uint inum; // Inode number
int ref; // Reference count
struct sleeplock lock; // protects everything below here
int valid; // inode has been read from disk?
short type; // copy of disk inode
short major;
short minor;
short nlink;
uint size;
uint addrs[NDIRECT+1];
};
有 4 种锁或类锁的机制
iget() 返回一个 inode
ilock() 是加锁的,iget() 不加锁
inode cache 是 write-through(直写的),调用 iupdate() 直接反应到磁盘上
分配一个新的 inode 结点(新建文件等)的流程如下
目录的内部实现很像文件。
// kernel/fs.h
struct dirent {
ushort inum;
char name[DIRSIZ];
};
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。