文章目录

1. Buffer cache layer
- balloc
- Bfree
2. Inode Layer
- Ialloc&Iget
- void ilock(struct inode *ip)
- void iput(struct inode *ip)
- itrunc&iupdate
- static uint bmap(struct inode *ip, uint bn)
- readi
- writei
- stati
Directory Layer
- struct dirent
- dirlookup函数
- dirlink
File Descriptor Layer (file.c)
- struct file
- filealloc
- filedup
- fileclose
- filestat
- fileread
- filewrite
System calls sysfile.c
- sys_link
- Create函数

1. Buffer cache layer

balloc

要分配一个新的block，就需要根据bitmap了解block的free/allocated情况。因此，可以遍历data block，对于每一个data block查看是否是free，如果是就直接返回这个block就可以了。
在Xv6中，如果每个data block都去查找一次bitmap block，效率其实很低，因为一个data block只对应bitmap block中的一个bit；这时候如果把读到的整个bitmap block的所有bits都检查一遍，效率会提升很多。
下面代码中，外层循环b表示一个data block，遍历每一个data block（从下标0开始），通过BBLOCK(b, sb)计算出data block b是bitmap block的第几个block。由于我们每次把一整个bitmap block都检查完，所以相当于一次检查了BPB(8 * 1024)个data block，所以外层循环每次b递增BPB。
BBLOCK的计算原理，之前在学校操作系统课上算的很溜，写代码这里有点愣了，正好复习一下。
如何计算一个No. B data block在Bitmap中是第几个block？
第一个Bitmap block起始位置的block号 + B / BPB
作为bitmap，一个block的大小是1024bytes，可以存储1024*8个bit，也就是可以表示1024*8个block。因此 $\lfloor B / (1024 \times 8) \rfloor$ 就是B应该在bitmap的第几个block。
一个bitmap block有1024*8个bits，就可以代表这么多的data block。1024*8在Xv6代码中是BPB = (BSIZE * 8)
读取到bp后，内层循环bi表示一个bitmap bit，每次递增1。
由于bread返回的是有lock的buffer，所以这里也会防止race condition导致的死锁。最后通过brelse释放锁。

// Allocate a zeroed disk block on device dev.
// Walks the free bitmap one bitmap block at a time, claims the first
// clear bit, zeroes the corresponding block and returns its number.
// Panics when no free block is found.
static uint
balloc(uint dev)
{
  int base, bit, mask;
  struct buf *bmp = 0;

  // Each bitmap block covers BPB consecutive block numbers.
  for(base = 0; base < sb.size; base += BPB){
    bmp = bread(dev, BBLOCK(base, sb));
    for(bit = 0; bit < BPB && base + bit < sb.size; bit++){
      mask = 1 << (bit % 8);
      if((bmp->data[bit/8] & mask) != 0)
        continue;                   // Block already in use.
      bmp->data[bit/8] |= mask;     // Mark block in use.
      log_write(bmp);
      brelse(bmp);
      bzero(dev, base + bit);
      return base + bit;
    }
    brelse(bmp);
  }
  panic("balloc: out of blocks");
}

Bfree

有了理解balloc的基础上，理解Bfree就更容易了。

读取对应bitmap block到buffer bp中
找到data block b在bitmap block中对应的bit bi。
使用mask方法把bi对应的位置设置为0（free）。
调用log_write(bp)。
释放bp

// Free disk block b on device dev by clearing its bit in the bitmap.
// Panics if the block is already marked free.
static void
bfree(int dev, uint b)
{
  struct buf *bmp;
  int bit, mask;

  bmp = bread(dev, BBLOCK(b, sb));
  bit = b % BPB;              // Bit index inside this bitmap block.
  mask = 1 << (bit % 8);      // Bit position inside its byte.

  if(!(bmp->data[bit/8] & mask))
    panic("freeing free block");

  bmp->data[bit/8] &= ~mask;
  log_write(bmp);
  brelse(bmp);
}

2. Inode Layer

Ialloc&Iget

Ialloc的作用是分配一个inode，inode在disk中存储的方式也是block，一个block的大小为1024bytes，struct dinode(disk中的inode形式)为64字节（4个short共8字节 + uint size 4字节 + addrs[NDIRECT+1]即13个uint共52字节），所以一个block能存放16个inode的信息。

// On-disk inode structure
struct dinode {short type;           // File typeshort major;          // Major device number (T_DEVICE only)short minor;          // Minor device number (T_DEVICE only)short nlink;          // Number of links to inode in file systemuint size;            // Size of file (bytes)uint addrs[NDIRECT+1];   // Data block addresses
};

要分配一个inode，需要遍历disk的inode所有节点，查看是否有一个inode是free状态。

首先从disk中读取对应inum所在的block，经过IBLOCK进行计算inum在disk中的block位置
在kernel获取到bp指针后，从bp指针的data中获取在一个block里面所有struct dinode，通过IPB计算出当前inum对应的inode应该在哪个位置，dip指针指向该位置。
如果发现dip->type == 0说明这是一个free inode，在返回这个inode之前，必须要将disk的struct dinode转换成内存中的struct inode这两个结构是不一样的，调用iget(uint dev, uint inum)方法，会寻找这个inode是否已经在内存的有cache了，如果没有cache当前inode，直接寻找一个free的inode cache，写入inode信息到cache的inode，包含dev，inum，ref。但是不要lock，不要从disk读取，这是iget特殊的地方。这里有一个注意的地方就是ip->valid = 0而不是1，这里在下面的ilock会用到这个valid来从硬盘读取inode的信息。

// Inodes per block: how many on-disk inodes (struct dinode) fit in
// one BSIZE-byte block.
#define IPB           (BSIZE / sizeof(struct dinode))

// Block containing inode i: inodes are packed IPB per block starting
// at block sb.inodestart. The sb argument is parenthesised so any
// struct-valued expression can be passed safely.
#define IBLOCK(i, sb)     ((i) / IPB + (sb).inodestart)

// Allocate an inode on device dev.
// The first on-disk inode whose type is 0 is claimed by zeroing it
// and storing the requested type.
// Returns an unlocked but allocated and referenced inode.
// Panics when no free on-disk inode is found.
struct inode*
ialloc(uint dev, short type)
{
  int inum;
  struct buf *blk;
  struct dinode *din;

  // The scan starts at inode number 1.
  for(inum = 1; inum < sb.ninodes; inum++){
    blk = bread(dev, IBLOCK(inum, sb));
    din = (struct dinode*)blk->data + inum%IPB;
    if(din->type != 0){     // Slot in use, keep scanning.
      brelse(blk);
      continue;
    }
    // A free inode: claim it.
    memset(din, 0, sizeof(*din));
    din->type = type;
    log_write(blk);         // Mark it allocated on the disk.
    brelse(blk);
    return iget(dev, inum);
  }
  panic("ialloc: no inodes");
}

// Look up inode number inum on device dev in the in-memory inode
// table and return it with its reference count bumped.
// If it is not cached, an unused table slot is claimed for it with
// valid = 0. The inode is neither locked nor read from disk here.
// Panics when the table has no unused slot.
static struct inode*
iget(uint dev, uint inum)
{
  struct inode *cur;
  struct inode *slot = 0;

  acquire(&itable.lock);

  // Is the inode already in the table?
  for(cur = &itable.inode[0]; cur < &itable.inode[NINODE]; cur++){
    if(cur->ref > 0 && cur->dev == dev && cur->inum == inum){
      cur->ref++;
      release(&itable.lock);
      return cur;
    }
    // Remember the first unused slot in case of a miss.
    if(slot == 0 && cur->ref == 0)
      slot = cur;
  }

  // Recycle an inode entry.
  if(slot == 0)
    panic("iget: no inodes");

  slot->dev = dev;
  slot->inum = inum;
  slot->ref = 1;
  slot->valid = 0;
  release(&itable.lock);

  return slot;
}

void ilock(struct inode *ip)

想要读取或者修改inode的任何数据，必须先调用ilock方法，确保只有一个进程可以访问一个in-memory inode。
ilock使用的是sleep lock。
如果inode->valid = 0，就需要从disk中读取信息。正好符合iget将inode->valid = 0match上。
解锁需要使用iunlock(struct inode *ip)

// Lock the given inode.
// Reads the inode from disk if necessary.
void
ilock(struct inode *ip)
{struct buf *bp;struct dinode *dip;if(ip == 0 || ip->ref < 1)panic("ilock");acquiresleep(&ip->lock);if(ip->valid == 0){bp = bread(ip->dev, IBLOCK(ip->inum, sb));dip = (struct dinode*)bp->data + ip->inum%IPB;ip->type = dip->type;ip->major = dip->major;ip->minor = dip->minor;ip->nlink = dip->nlink;ip->size = dip->size;memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));brelse(bp);ip->valid = 1;if(ip->type == 0)panic("ilock: no type");}
}

void iput(struct inode *ip)

函数作用是减少一个C指针对于inode的引用
如果当前引用是最后一个引用，那么内存的inode cache buffer就可以被回收给其他inode使用。并且如果没有links到当前inode，意味着当前inode和它的data block可以被free。iput调用itrunc将文件设置为0字节，并且释放data blocks，设置inode type=0，最后把inode写回disk。
iput的locking原则，讨论并发情况下是否iput存在死锁等问题，
第一种设想的危险情景是一个并发线程等待ilock来使用当前inode，但是当前inode即将要被free。这种情景其实是不存在的，在iput函数中，可以进入if条件要free当前inode的情况必须是ip->ref == 1以及ip->nlink == 0，这个最后的引用就是当前调用iput的进程，并且nlink=0表示已经没有目录项指向当前inode，其他进程无法再获取到它。

疑问：是否有可能进入if条件后，但是在执行获取ip->lock之前另一个进程修改了nlink呢？
第二种设想的危险是：一个并发调用ialloc，ialloc选取了和iput正在free的inode一样的inode（因为ip->type = 0在iput中设置了，所以ialloc可能获取到当前inode）。尽管ialloc可以获取到这个inode，但是这个过程肯定是在ip->type = 0执行后，在releasesleep(&ip->lock)之前执行，ialloc获取的inode才能被修改和读取（读取或者修改一个inode必须调用ilock）。
iput和crash是值得考虑的另一个问题：当文件的link count降为0时，如果内存中还有另一个进程仍旧引用当前inode，iput不会立刻调用itrunc去删除文件内容。如果这时电脑突然crash，这个inode既没有被free，释放操作也没有写入disk，crash之后的状态就是inode在disk中nlink是0，但是没有被free掉。
文件系统通常采用两种方式在重启电脑后恢复正常，

最简单的方法：扫描整个文件系统，files被标记为allocated，但是没有dir引用这个文件，那么这个文件直接free就可以了。
第二种方法不需要扫描整个文件系统，文件系统会在super block中记录link count是0，但是refcount不为0的inode number，形式为一个list。如果文件系统成功在refcount为0的时候删除文件内容了，那么就更新disk中的这个记录，把这个inode number从list移除。如果crash了，系统重启后会释放所有在list中的文件。
疑问：
这里提到不会产生deadlock的原因不是很明白，为何icache的lock和ip的sleeplock会导致死锁，难道是有的进程会获取ip的sleeplock后再去获取icache的lock？形成race condition？目前还没找到对应的代码，等到lab再看看。

// Drop one reference to an in-memory inode.
// When the last reference goes away the table entry can be recycled.
// When, in addition, the inode is valid and has no links, its content
// and the inode itself are freed on disk first.
// All calls to iput() must be inside a transaction in
// case it has to free the inode.
void
iput(struct inode *ip)
{
  int last_and_unlinked;

  acquire(&itable.lock);

  last_and_unlinked = (ip->ref == 1 && ip->valid && ip->nlink == 0);
  if(last_and_unlinked){
    // inode has no links and no other references: truncate and free.

    // ip->ref == 1 means no other process can have ip locked,
    // so this acquiresleep() won't block (or deadlock).
    acquiresleep(&ip->lock);

    // The table spin-lock is dropped while the inode is truncated
    // and written back.
    release(&itable.lock);

    itrunc(ip);
    ip->type = 0;
    iupdate(ip);
    ip->valid = 0;

    releasesleep(&ip->lock);

    acquire(&itable.lock);
  }

  ip->ref--;
  release(&itable.lock);
}

itrunc&iupdate

itrunc函数仅仅用来删除一个inode节点指向的所有data block，第一个for循环遍历0到11，一共12个direct引用data block，并且完成free。
判断二层引用data block是否存在，如果存在依旧是free。最后设置ip size为0，调用iupdate写入disk。

// Truncate inode (discard contents).
// Caller must hold ip->lock.
void
itrunc(struct inode *ip)
{int i, j;struct buf *bp;uint *a;for(i = 0; i < NDIRECT; i++){if(ip->addrs[i]){bfree(ip->dev, ip->addrs[i]);ip->addrs[i] = 0;}}if(ip->addrs[NDIRECT]){bp = bread(ip->dev, ip->addrs[NDIRECT]);a = (uint*)bp->data;for(j = 0; j < NINDIRECT; j++){if(a[j])bfree(ip->dev, a[j]);}brelse(bp);bfree(ip->dev, ip->addrs[NDIRECT]);ip->addrs[NDIRECT] = 0;}ip->size = 0;iupdate(ip);
}

// Write the in-memory inode's on-disk fields (type, major, minor,
// nlink, size, addrs) into its slot in the inode block and log it.
// Must be called after every change to an ip->xxx field
// that lives on disk.
// Caller must hold ip->lock.
void
iupdate(struct inode *ip)
{
  struct buf *blk = bread(ip->dev, IBLOCK(ip->inum, sb));
  struct dinode *din = (struct dinode*)blk->data + ip->inum%IPB;

  din->type = ip->type;
  din->major = ip->major;
  din->minor = ip->minor;
  din->nlink = ip->nlink;
  din->size = ip->size;
  memmove(din->addrs, ip->addrs, sizeof(ip->addrs));

  log_write(blk);
  brelse(blk);
}

static uint bmap(struct inode *ip, uint bn)

bmap的作用是获取指定inode的第几个block的数据的地址。如果对应的block不存在就分配所需要的block，如果indirect的block也不存在也分配。
bmap首先判断bn是在0到11还是在indirect block。

// Inode content
//
// The content (data) associated with each inode is stored
// in blocks on the disk. The first NDIRECT block numbers
// are listed in ip->addrs[].  The next NINDIRECT blocks are
// listed in block ip->addrs[NDIRECT].

// Return the disk block address of the bn'th block of inode ip.
// A missing data block, and a missing indirect block, is allocated
// on demand. Panics when bn is beyond NDIRECT + NINDIRECT.
static uint
bmap(struct inode *ip, uint bn)
{
  uint addr, *entries;
  struct buf *ind;

  // Direct block.
  if(bn < NDIRECT){
    addr = ip->addrs[bn];
    if(addr == 0){
      addr = balloc(ip->dev);
      ip->addrs[bn] = addr;
    }
    return addr;
  }
  bn -= NDIRECT;

  if(bn < NINDIRECT){
    // Load indirect block, allocating if necessary.
    addr = ip->addrs[NDIRECT];
    if(addr == 0){
      addr = balloc(ip->dev);
      ip->addrs[NDIRECT] = addr;
    }
    ind = bread(ip->dev, addr);
    entries = (uint*)ind->data;
    addr = entries[bn];
    if(addr == 0){
      addr = balloc(ip->dev);
      entries[bn] = addr;
      log_write(ind);     // The indirect block itself changed.
    }
    brelse(ind);
    return addr;
  }

  panic("bmap: out of range");
}

readi

readi函数用于读取inode中大小为n的数据，读取的数据在inode中的位置从inode的off开始到off+n位置（等会这些地址会被转换成对应的block）。所有的数据都写入dst即可。
读取的数据保存到传入的指针dst，调用者必须对ip上锁。
readi对bmap进行封装，调用readi的函数只需要提供地址就可以，而不需要再去关注具体的inode实现细节。
readi首先确保offset（起始位置）和offset+n（终点）位置的大小是有效的。
for循环完成数据拷贝到dst地址的工作，先查找到对应的数据block，开始拷贝数据到对应的地址，由于是从kernel的数据拷贝到dst，所以使用either_copyout（拷贝内存中的数据到用户地址或者内核地址）。

// Read up to n bytes from inode ip starting at byte offset off.
// Caller must hold ip->lock.
// If user_dst==1, then dst is a user virtual address;
// otherwise, dst is a kernel address.
// Returns the number of bytes read, 0 when off is past the end of
// the file or off + n wraps around, and -1 when the copy-out fails.
int
readi(struct inode *ip, int user_dst, uint64 dst, uint off, uint n)
{
  uint done, chunk;
  struct buf *blk;

  // Reject a start past end-of-file or a wrapped range.
  if(off > ip->size || off + n < off)
    return 0;
  // Clamp the request to the end of the file.
  if(off + n > ip->size)
    n = ip->size - off;

  for(done = 0; done < n; done += chunk, off += chunk, dst += chunk){
    // Map the file offset to its disk block and fetch that block.
    blk = bread(ip->dev, bmap(ip, off/BSIZE));
    // Copy at most up to the end of this block.
    chunk = min(n - done, BSIZE - off%BSIZE);
    if(either_copyout(user_dst, dst, blk->data + (off % BSIZE), chunk) == -1){
      brelse(blk);
      done = -1;          // Report failure to the caller.
      break;
    }
    brelse(blk);
  }
  return done;
}

writei

该函数和readi基本上是一样的，只不过writei将内存的数据从对应的地址写入到block中。
writei将从内存地址src位置开始大小为n的数据写入内存中的inode从off到off+n的位置（会被转换成对应的block在inode中）。

如果文件大小需要增大，bmap会完成这个任务。但是上限仍旧是文件的最大大小，即(256+12)个block（代码中的MAXFILE*BSIZE字节）。
for遍历会完成数据的拷贝到file的block中。
最后如果文件大小增大了，那么ip->size也要对应增大。
Readi和writei都需要检查ip的type是不是T_DEVICE，这是用来处理有些文件的数据不在文件系统中。这个会在文件描述符层讨论。（注意：这里贴出的readi/writei代码中并没有这个检查，设备文件是在fileread/filewrite里通过devsw处理的。）

// Write n bytes to inode ip starting at byte offset off.
// Caller must hold ip->lock.
// If user_src==1, then src is a user virtual address;
// otherwise, src is a kernel address.
// Returns the number of bytes successfully written.
// If the return value is less than the requested n,
// there was an error of some kind.
// Returns -1 when off is past the end of the file, off + n wraps
// around, or the write would exceed MAXFILE*BSIZE bytes.
int
writei(struct inode *ip, int user_src, uint64 src, uint off, uint n)
{
  uint done, chunk;
  struct buf *blk;

  if(off > ip->size || off + n < off)
    return -1;
  if(off + n > MAXFILE*BSIZE)
    return -1;

  for(done = 0; done < n; done += chunk, off += chunk, src += chunk){
    // Find the block backing this offset.
    blk = bread(ip->dev, bmap(ip, off/BSIZE));
    // Copy at most up to the end of this block.
    chunk = min(n - done, BSIZE - off%BSIZE);
    if(either_copyin(blk->data + (off % BSIZE), user_src, src, chunk) == -1){
      brelse(blk);
      break;
    }
    log_write(blk);       // Logs the data block only, not the inode.
    brelse(blk);
  }

  // The write ended beyond the old end of file: grow the size.
  if(off > ip->size)
    ip->size = off;

  // write the i-node back to disk even if the size didn't change
  // because the loop above might have called bmap() and added a new
  // block to ip->addrs[].
  iupdate(ip);

  return done;
}

stati

这个函数用于拷贝inode的metadata到stat结构中，stat是暴露给user program调用stat系统调用。

// Fill *st with the metadata of inode ip
// (device, inode number, type, link count, size).
// Caller must hold ip->lock.
void
stati(struct inode *ip, struct stat *st)
{
  // Identity of the inode.
  st->dev = ip->dev;
  st->ino = ip->inum;

  // Attributes of the file.
  st->type = ip->type;
  st->nlink = ip->nlink;
  st->size = ip->size;
}

Directory Layer

struct dirent

一个目录内部实现形式和文件很相似，它的inode的type是T_DIR，data部分是一系列的directory entries. 每个entry是一个struct dirent结构，这个结构包含了一个inum和name，inode number是0表示free，name最多支持14个字符，如果少于14字符，用NUL（0）结尾。

// fs.h:56
// https://github.com/mit-pdos/xv6-riscv/blob/riscv//kernel/fs.h#L56
// Directory is a file containing a sequence of dirent structures.
#define DIRSIZ 14

struct dirent {
  ushort inum;          // Inode number of the entry.
  char name[DIRSIZ];    // Entry name, at most DIRSIZ bytes.
};

dirlookup函数

该函数接受一个要查找的目录名char * name 以及要在哪个目录下struct inode *dp查找这个目录（name）。如果找到了目录（name），那么返回一个inode指针指向这个目录inode节点，并且把传入的uint *poff设置成dp目录到name目录的位置字节偏移量，方便调用者修改查找到的目录。
值得注意的是Dirlookup函数返回的inode是unlocked，这个inode是由iget函数获取到的。Dirlookup为什么不返回locked inode呢？因为调用dirlookup的caller已经lock up了dp，才能传入dp给dirlookup。如果在dp当前目录下查找目录，找到的结果要lock，必须要先lock parent，但是由于parent已经lock了，再次重复lock parent dp，就导致了死锁。因此交由dirlookup的caller来unlock dp 和lock ip是保证只上锁一次的关键点。

// Search directory dp for an entry called name.
// On a match, store the entry's byte offset in *poff (when poff is
// non-null) and return the inode obtained from iget(); that inode is
// not locked here. Returns 0 when no entry matches.
// Panics if dp is not a directory or an entry cannot be read in full.
struct inode*
dirlookup(struct inode *dp, char *name, uint *poff)
{
  uint pos;
  struct dirent ent;

  if(dp->type != T_DIR)
    panic("dirlookup not DIR");

  for(pos = 0; pos < dp->size; pos += sizeof(ent)){
    if(readi(dp, 0, (uint64)&ent, pos, sizeof(ent)) != sizeof(ent))
      panic("dirlookup read");
    if(ent.inum == 0)               // Unused slot.
      continue;
    if(namecmp(name, ent.name) != 0)
      continue;

    // Entry matches path element.
    if(poff)
      *poff = pos;
    return iget(dp->dev, ent.inum);
  }

  return 0;
}

dirlink

int dirlink(struct inode *dp, char *name, uint inum)用来在dp目录下，创建一个新的名为name目录。
首先查找是否dp目录下已经存在name的目录，如果存在name目录，调用iput函数对刚才调用dirlookup新增的引用删除掉，并且返回-1表示目录已经存在。
如果没有存在name目录，继续后续的步骤。
由于到这里已经确定了目录下不存在name目录，寻找一个空的dirent节点，遍历dp，这一步骤类似于dirlookup的遍历，找到一个de.inum == 0的节点就可以退出循环。
下面将传入的name和inum写入找到的empty dirent中。并且调用writei将新增的name目录写入到dp目录中，并且dp写入disk更新目录disk image。

// Write a new directory entry (name, inum) into the directory dp.
// Returns -1 if an entry called name already exists, 0 on success.
// The first slot with inum == 0 is reused; when there is none the
// entry is written at offset dp->size.
// Panics if a slot cannot be read or the entry cannot be written.
int
dirlink(struct inode *dp, char *name, uint inum)
{
  int pos;
  struct dirent ent;
  struct inode *existing;

  // Check that name is not present.
  existing = dirlookup(dp, name, 0);
  if(existing != 0){
    iput(existing);       // Drop the reference dirlookup took.
    return -1;
  }

  // Look for an empty dirent.
  for(pos = 0; pos < dp->size; pos += sizeof(ent)){
    if(readi(dp, 0, (uint64)&ent, pos, sizeof(ent)) != sizeof(ent))
      panic("dirlink read");
    if(ent.inum == 0)
      break;
  }

  // Fill the entry and store it at the chosen offset.
  strncpy(ent.name, name, DIRSIZ);
  ent.inum = inum;
  if(writei(dp, 0, (uint64)&ent, pos, sizeof(ent)) != sizeof(ent))
    panic("dirlink");

  return 0;
}

File Descriptor Layer (file.c)

Unix将大多数资源抽象成file文件，包括console，pipe和真实的文件。实现这种抽象的代码就是靠File descriptor layer。
xv6 给每个进程提供了一个file descriptor表。每一个open file都是用struct file表示的。open方法就是创建一个新的open file，也就是一个in-memory struct file。
不同进程可以独立的打开相同文件，并且拥有这个文件不同的offset。
一个open file（struct file）可以出现在一个process的file table多次，例如一个进程调用dup创建aliases，或者调用fork分享给child process。struct file的ref会记录一个file的引用次数。文件open后支持read，write。struct file的readable, writable表示文件是否可读，可写。

struct file

struct file {enum { FD_NONE, FD_PIPE, FD_INODE, FD_DEVICE } type;int ref; // reference countchar readable;char writable;struct pipe *pipe; // FD_PIPEstruct inode *ip;  // FD_INODE and FD_DEVICEuint off;          // FD_INODEshort major;       // FD_DEVICE
};

所有打开的文件都会保存在一个全局global的file table（ftable）。这个文件表有支持allocate a file的函数filealloc，创建重复引用的filedup，释放一个引用的fileclose，和读取数据的fileread，写入数据的filewrite。

// Global table of open files; lock is held while entries are
// allocated or their reference counts are changed.
struct {
  struct spinlock lock;
  struct file file[NFILE];
} ftable;

filealloc

filealloc分配一个struct file，返回分配到的struct file *f指针。
步骤：

获取ftable的锁
遍历ftable的struct file数组，找到引用是0的一个file结构，更改ref为1，释放ftable锁。返回f。
如果遍历后，没找到释放ftable锁，返回0表示没有找到文件。

// Allocate a file structure: claim the first ftable entry whose
// reference count is 0 and set its count to 1.
// Returns 0 when every entry is in use.
struct file*
filealloc(void)
{
  struct file *slot;
  struct file *found = 0;

  acquire(&ftable.lock);
  for(slot = ftable.file; slot < ftable.file + NFILE; slot++){
    if(slot->ref != 0)
      continue;
    slot->ref = 1;        // Unused entry: claim it.
    found = slot;
    break;
  }
  release(&ftable.lock);

  return found;
}

filedup

filedup用来增加对一个file f的refcount。由于file结构本身没有锁，所以想要修改单独一个file结构需要锁整个file table。

// Take one more reference to file f and return f.
// Panics if f is not currently referenced.
struct file*
filedup(struct file *f)
{
  acquire(&ftable.lock);

  if(f->ref < 1)
    panic("filedup");
  f->ref += 1;

  release(&ftable.lock);
  return f;
}

fileclose

fileclose函数，用来减少一个文件的refcount。
如果refcount-1后仍旧大于0，那么说明还有别的FD引用这个文件，没必要修改，直接返回。
如果refcount-1后为0，需要对struct file *f进行清空方便后续复用ftable的这个结构数组元素。这时候可以通过ff来引用f的地址，从而提前释放锁。
如果是pipe，就关闭pipe。
如果是struct file的type是inode或者device，那么就调用iput减少对该节点的引用。

// Close file f.  (Decrement ref count, close when reaches 0.)
void
fileclose(struct file *f)
{struct file ff;acquire(&ftable.lock);if(f->ref < 1)panic("fileclose");if(--f->ref > 0){release(&ftable.lock);return;}ff = *f;f->ref = 0;f->type = FD_NONE;release(&ftable.lock);if(ff.type == FD_PIPE){pipeclose(ff.pipe, ff.writable);} else if(ff.type == FD_INODE || ff.type == FD_DEVICE){begin_op();iput(ff.ip);end_op();}
}

filestat

用来获取文件f的数据到addr中，addr指向一个struct stat结构。
这个函数只对INODE和DEVICE有效，因为底层调用的是stati。
调用完stati获取到数据后，将数据传入用户空间调用copyout。

// Get metadata about file f and copy it out to user space.
// addr is a user virtual address, pointing to a struct stat.
// Returns 0 on success; -1 when f is neither an inode nor a device
// file, or when the copy-out fails.
int
filestat(struct file *f, uint64 addr)
{
  struct proc *p = myproc();
  struct stat st;

  if(f->type != FD_INODE && f->type != FD_DEVICE)
    return -1;

  ilock(f->ip);
  stati(f->ip, &st);
  iunlock(f->ip);

  if(copyout(p->pagetable, addr, (char *)&st, sizeof(st)) < 0)
    return -1;
  return 0;
}

fileread

判断文件的类型，如果是Inode，需要先获取锁，再调用readi直接读取inode的off地址到off+n的内容。并且更新struct file *f的offset，再释放锁。这样可以保证file结构offset读写都是正确的，防止其他进程也同时修改offset造成文件的覆盖。

// Read up to n bytes from file f into addr.
// addr is a user virtual address.
// Returns the result of the underlying read, or -1 when f is not
// readable or its device has no read routine.
// Panics on an unknown file type.
int
fileread(struct file *f, uint64 addr, int n)
{
  int r = 0;

  if(f->readable == 0)
    return -1;

  switch(f->type){
  case FD_PIPE:
    r = piperead(f->pipe, addr, n);
    break;

  case FD_DEVICE:
    if(f->major < 0 || f->major >= NDEV || !devsw[f->major].read)
      return -1;
    r = devsw[f->major].read(1, addr, n);
    break;

  case FD_INODE:
    // The file offset is read and advanced while the inode is locked.
    ilock(f->ip);
    r = readi(f->ip, 1, addr, f->off, n);
    if(r > 0)
      f->off += r;
    iunlock(f->ip);
    break;

  default:
    panic("fileread");
  }

  return r;
}

filewrite

filewrite用来写入虚拟地址addr的内容到file f中。
对于INODE来说，由于log一次transaction的文件是有block上限，要先判断是否超过上限。

// Write n bytes from addr to file f.
// addr is a user virtual address.
// For pipes and devices, returns the result of the underlying write.
// For inode files, returns n when everything was written, -1 otherwise.
// Returns -1 when f is not writable or its device has no write routine.
// Panics on an unknown file type.
int
filewrite(struct file *f, uint64 addr, int n)
{
  int r, ret = 0;

  if(f->writable == 0)
    return -1;

  switch(f->type){
  case FD_PIPE:
    ret = pipewrite(f->pipe, addr, n);
    break;

  case FD_DEVICE:
    if(f->major < 0 || f->major >= NDEV || !devsw[f->major].write)
      return -1;
    ret = devsw[f->major].write(1, addr, n);
    break;

  case FD_INODE: {
    // write a few blocks at a time to avoid exceeding
    // the maximum log transaction size, including
    // i-node, indirect block, allocation blocks,
    // and 2 blocks of slop for non-aligned writes.
    // this really belongs lower down, since writei()
    // might be writing a device like the console.
    int max = ((MAXOPBLOCKS-1-1-2) / 2) * BSIZE;
    int i = 0;

    while(i < n){
      // Size of this piece, capped at one transaction's worth.
      int n1 = n - i;
      if(n1 > max)
        n1 = max;

      begin_op();
      ilock(f->ip);
      r = writei(f->ip, 1, addr + i, f->off, n1);
      if(r > 0)
        f->off += r;      // Advance the file offset.
      iunlock(f->ip);
      end_op();

      if(r != n1){
        // error from writei
        break;
      }
      i += r;             // Bytes written so far.
    }
    ret = (i == n ? n : -1);
    break;
  }

  default:
    panic("filewrite");
  }

  return ret;
}

System calls sysfile.c

sys_link

sys_link用来创建一个新的名字给原本存在的一个inode，相当于创建一个别名。
步骤：

获取系统调用传入的参数，使用argstr获取old和new两个路径。这里假设old不是目录。
开启transaction
调用namei根据，old名查找对应的inode，old必须不是一个目录
找到inode后对它的nlink++
接下来要对于new所在的目录进行更改，创建一个新的directory entry给new。调用nameiparent，查找到new路径的parent的inode，添加directory entry。
new的parent directory必须存在，并且和old的inode必须要在同一个设备上：这是确保inode numbers在单一的disk上只有唯一的意思。
调用dirlink，新添加一个目录entry到new的parent dir inode下。
结束transaction
transaction优化了很多步骤，我们在还没有创建link之前就对ip->nlink做了更新，这本来会让系统处于不安全状态，但是有了transaction可以一次完成所有步骤，无所谓顺序。

// Create the path new as a link to the same inode as old.
// Returns 0 on success and -1 on failure: bad arguments, old not
// found, old is a directory, new's parent not found, parent on a
// different device, or the directory entry could not be added.
// The link count is raised before the entry is added and lowered
// again on the failure path.
uint64
sys_link(void)
{
  char elem[DIRSIZ], newpath[MAXPATH], oldpath[MAXPATH];
  struct inode *parent, *target;

  if(argstr(0, oldpath, MAXPATH) < 0 || argstr(1, newpath, MAXPATH) < 0)
    return -1;

  begin_op();
  target = namei(oldpath);
  if(target == 0){
    end_op();
    return -1;
  }

  ilock(target);
  if(target->type == T_DIR){
    iunlockput(target);
    end_op();
    return -1;
  }

  target->nlink++;
  iupdate(target);
  iunlock(target);

  parent = nameiparent(newpath, elem);
  if(parent == 0)
    goto bad;
  ilock(parent);
  if(parent->dev != target->dev || dirlink(parent, elem, target->inum) < 0){
    iunlockput(parent);
    goto bad;
  }
  iunlockput(parent);
  iput(target);

  end_op();

  return 0;

bad:
  // Undo the link-count bump.
  ilock(target);
  target->nlink--;
  iupdate(target);
  iunlockput(target);
  end_op();
  return -1;
}

Create函数

sys_link是为一个已经存在的inode创建一个新的名字，而create函数是为一个是新分配的inode创建一个新的名字。
create函数可以给三种文件创建系统调用使用：

open with O_CREATE flag makes a new ordinary file
mkdir创建一个新的目录
mkdev创建一个新的device file

create函数逻辑：

create函数首先调用nameiparent查找当前path的父级目录，nameiparent返回父级目录的inode和final path element into name（目录的最后名字吧？14大小只有）
根据第一步找到的父级目录，对父级目录inode dp上锁，查找父级目录dp是否包含name已经存在，新的一个名字（推测上面的name应该是一个新的name）。
如果name，已经存在，并且是由函数open调用的，也就是type == T_FILE，并且inode也是个file，那么open就认为成功了，返回就可以。
如果name，没有存在，分配一个新的inode。如果新的inode是一个目录（mkdir调用），那么还需要初始化.和..。最后调用dirlink和父级目录建立关系。

关于锁的描述，reading写得很清楚了。

Create, like sys_link, holds two inode locks simultaneously: ip and dp. There is no possibility of deadlock because the inode ip is freshly allocated: no other process in the system will hold ip ’s lock and then try to lock dp.

static struct inode*
create(char *path, short type, short major, short minor)
{struct inode *ip, *dp;char name[DIRSIZ];if((dp = nameiparent(path, name)) == 0)return 0;ilock(dp);if((ip = dirlookup(dp, name, 0)) != 0){iunlockput(dp);ilock(ip);if(type == T_FILE && (ip->type == T_FILE || ip->type == T_DEVICE))return ip;iunlockput(ip);return 0;}if((ip = ialloc(dp->dev, type)) == 0)panic("create: ialloc");ilock(ip);ip->major = major;ip->minor = minor;ip->nlink = 1;iupdate(ip);if(type == T_DIR){  // Create . and .. entries.dp->nlink++;  // for ".."iupdate(dp);// No ip->nlink++ for ".": avoid cyclic ref count.if(dirlink(ip, ".", ip->inum) < 0 || dirlink(ip, "..", dp->inum) < 0)panic("create dots");}if(dirlink(dp, name, ip->inum) < 0)panic("create: dirlink");iunlockput(dp);return ip;
}
```**

MIT-6.s081-CodeWalk-fs.cfile.csysfile.c相关推荐

MIT 6.s081学习笔记
MIT 6.s081学习笔记 introduction 计算机组织结构: 最底部是一些硬件资源,包括了CPU,内存,磁盘,网卡最上层会运行各种应用程序,比如vim,shell等,这些就是正在运行的所 ...
xv6 6.S081 Lab8: fs
xv6 6.S081 Lab8: fs 写在前面实验介绍开始! Large File Symbolic links fs代码在这里.我的妈呀,终于要写完了,xv6的file system考察难度并 ...
MIT 6.S081 Lab4 traps
#Lab4: traps #Source #My Code #Motivation #Backtrace (moderate) #Motivation #Solution #S0 - RISC-V 栈 ...
「实验记录」MIT 6.S081 Lab7 multithreading
#Lab7: multithreading I. Source II. My Code III. Motivation IV. Uthread: switching between threads ( ...
操作系统实验Lab 1:Xv6 and Unix utilities(MIT 6.S081 FALL 2020)
Lab 1 Xv6 and Unix utilities 实验要求链接 Boot xv6 (easy) 实验目的切换到 xv6-labs-2020 代码的 util 分支,并利用 QEMU 模拟器启 ...
MIT 6.S081 聊聊xv6中的文件系统（上）
Lab一做一晚上,blog一写能写两天,比做Lab的时间还长( 这篇博文是半夜才写完的,本来打算写完后立刻发出来,但由于今天发现白天发博点击量会高点,就睡了一觉后才发(几十的点击量也是点击量啊T_T) ...
MIT 6.S081 lab 11：Networking
背景在你开始写代码前,你可能会发现xv6 book中的第五章:中断和设备驱动是很有帮助的. 你将使用一个叫做E1000的网络设备来处理网络通信.对于xv6(以及你写的驱动),E1000看起来像一 ...
MIT 6.S081 实验5 笔记与心得
Lab 5:Lazy 文章目录 Lab 5:Lazy 前期准备 Eliminate allocation from sbrk() ([easy](https://pdos.csail.mit.edu/ ...
MIT 6.S081 lab 5：lazy page allocation
1 Lab lab 5就是去实现xv6 book 4.6中写的 Lazy page allocation 有个问题:page fault的trap是如何出现的? 1.1 Eliminate alloc ...

MIT-6.s081-CodeWalk-fs.cfile.csysfile.c