Linux那些事儿之我是Sysfs(7)dentry与inode

我们在进程中要怎样去描述一个文件呢？我们用目录项(dentry)和索引节点(inode)。它们的定义如下:

include/linux/dcache.h

/*
 * struct dentry - in-memory directory entry (one per file name).
 *
 * A dentry carries the name of a file and a pointer to its parent
 * directory; the d_parent/d_child/d_subdirs links are what form the
 * directory tree.  d_inode points at the inode that the name refers to.
 * Several dentries may point at the same inode (hard links).
 */
struct dentry {
/* RCU lookup touched fields */
unsigned int d_flags;       /* protected by d_lock */
seqcount_t d_seq;       /* per dentry seqlock */
struct hlist_bl_node d_hash;    /* lookup hash list */
struct dentry *d_parent;    /* parent directory */
struct qstr d_name;     /* name of this entry */
struct inode *d_inode;      /* Where the name belongs to - NULL is
* negative */
unsigned char d_iname[DNAME_INLINE_LEN];    /* small names */
/* Ref lookup also touches following */
struct lockref d_lockref;   /* per-dentry lock and refcount */
const struct dentry_operations *d_op;   /* dentry operations table */
struct super_block *d_sb;   /* The root of the dentry tree */
unsigned long d_time;       /* used by d_revalidate */
void *d_fsdata;         /* fs-specific data */
struct list_head d_lru;     /* LRU list */
struct list_head d_child;   /* child of parent list */
struct list_head d_subdirs; /* our children */
/*
* d_alias and d_rcu can share memory
*/
union {
struct hlist_node d_alias;  /* inode alias list */
struct rcu_head d_rcu;      /* NOTE(review): presumably used for RCU-deferred freeing - confirm */
} d_u;
};

include/linux/fs.h

/*
 * struct inode - in-memory index node, the VFS view of a file.
 *
 * Holds the organisational/management information of a file (mode,
 * owner, size, timestamps, ...) as opposed to its name, which lives in
 * struct dentry.  Every filesystem is presented to the upper layers
 * through this structure; filesystem-specific state lives in a
 * containing structure that embeds a struct inode.
 */
struct inode {
umode_t         i_mode;     /* file mode */
unsigned short      i_opflags;
kuid_t          i_uid;      /* owner uid */
kgid_t          i_gid;      /* owner gid */
unsigned int        i_flags;
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl    *i_acl;
struct posix_acl    *i_default_acl;
#endif
const struct inode_operations   *i_op;  /* inode operations table */
struct super_block  *i_sb;  /* NOTE(review): presumably the super_block this inode belongs to */
struct address_space    *i_mapping;
#ifdef CONFIG_SECURITY
void            *i_security;
#endif
/* Stat data, not accessed from path walking */
unsigned long       i_ino;  /* inode number, unique within one filesystem */
/*
* Filesystems may only read i_nlink directly.  They shall use the
* following functions for modification:
*
*    (set|clear|inc|drop)_nlink
*    inode_(inc|dec)_link_count
*/
union {
const unsigned int i_nlink; /* read-only view */
unsigned int __i_nlink;     /* writable alias of the same storage */
};
dev_t           i_rdev;
loff_t          i_size;     /* file size */
struct timespec     i_atime;    /* access time */
struct timespec     i_mtime;    /* modification time */
struct timespec     i_ctime;    /* NOTE(review): presumably inode change time - confirm */
spinlock_t      i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short          i_bytes;
unsigned int        i_blkbits;
blkcnt_t        i_blocks;
#ifdef __NEED_I_SIZE_ORDERED
seqcount_t      i_size_seqcount;
#endif
/* Misc */
unsigned long       i_state;
struct mutex        i_mutex;
unsigned long       dirtied_when;   /* jiffies of first dirtying */
struct hlist_node   i_hash;
struct list_head    i_wb_list;  /* backing dev IO list */
struct list_head    i_lru;      /* inode LRU list */
struct list_head    i_sb_list;
/* i_dentry and i_rcu share storage */
union {
struct hlist_head   i_dentry;   /* NOTE(review): presumably the head of the dentry alias list (dentry d_u.d_alias) */
struct rcu_head     i_rcu;
};
u64         i_version;
atomic_t        i_count;
atomic_t        i_dio_count;
atomic_t        i_writecount;
#ifdef CONFIG_IMA
atomic_t        i_readcount; /* struct files open RO */
#endif
const struct file_operations    *i_fop; /* former ->i_op->default_file_ops */
struct file_lock    *i_flock;
struct address_space    i_data;
#ifdef CONFIG_QUOTA
struct dquot        *i_dquot[MAXQUOTAS];
#endif
struct list_head    i_devices;
/* NOTE(review): presumably only one member is meaningful, depending on the kind of file */
union {
struct pipe_inode_info  *i_pipe;
struct block_device *i_bdev;
struct cdev     *i_cdev;
};
__u32           i_generation;
#ifdef CONFIG_FSNOTIFY
__u32           i_fsnotify_mask; /* all events this inode cares about */
struct hlist_head   i_fsnotify_marks;
#endif
void            *i_private; /* fs or device private pointer */
};

所谓"文件", 就是按一定的形式存储在介质上的信息，所以一个文件其实包含了两方面的信息，一是存储的数据本身，二是有关该文件的组织和管理的信息。在内存中, 每个文件都有一个dentry(目录项)和inode(索引节点)结构，dentry记录着文件名，上级目录等信息，正是它形成了我们所看到的树状结构；而有关该文件的组织和管理的信息主要存放在inode里面，它记录着文件在存储介质上的位置与分布。同时dentry->d_inode指向相应的inode结构。dentry与inode是多对一的关系，因为有可能一个文件有好几个文件名(硬链接, hard link, 可以参考这个网页 http://www.ugrad.cs.ubc.ca/~cs219/CourseNotes/Unix/commands-links.html)。

所有的dentry用d_parent和d_child连接起来，就形成了我们熟悉的树状结构。

inode代表的是物理意义上的文件，通过inode可以得到一个数组，这个数组记录了文件内容的位置，如该文件位于硬盘的第3，8，10块，那么这个数组的内容就是3,8,10。其索引节点号inode->i_ino，在同一个文件系统中是唯一的，内核只要根据i_ino，就可以计算出它对应的inode在介质上的位置。就硬盘来说，根据i_ino就可以计算出它对应的inode属于哪个块(block)，从而找到相应的inode结构。但仅仅用inode还是无法描述出所有的文件系统，对于某一种特定的文件系统而言，比如ext3，在内存中用ext3_inode_info描述。它是一个包含inode的"容器"。

fs/ext3/ext3.h

/*
 * struct ext3_inode_info - ext3's in-memory inode.
 *
 * A "container" around the generic VFS inode: the ext3-specific fields
 * come first and the struct inode is embedded as the last member,
 * vfs_inode.  Given a struct inode pointer, the enclosing structure is
 * obtained with
 *	container_of(inode, struct ext3_inode_info, vfs_inode)
 */
struct ext3_inode_info {
__le32  i_data[15]; /* unconverted; array recording where the file's contents are located */
__u32   i_flags;
#ifdef EXT3_FRAGMENTS
__u32   i_faddr;
__u8    i_frag_no;
__u8    i_frag_size;
#endif
ext3_fsblk_t    i_file_acl;
__u32   i_dir_acl;
__u32   i_dtime;
/*
* i_block_group is the number of the block group which contains
* this file's inode.  Constant across the lifetime of the inode,
* it is used for making block allocation decisions - we try to
* place a file's data blocks near its inode block, and new inodes
* near to their parent directory's inode.
*/
__u32   i_block_group;
unsigned long   i_state_flags;  /* Dynamic state flags for ext3 */
/* block reservation info */
struct ext3_block_alloc_info *i_block_alloc_info;
__u32   i_dir_start_lookup;
#ifdef CONFIG_EXT3_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_mutex even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
*/
struct rw_semaphore xattr_sem;
#endif
struct list_head i_orphan;  /* unlinked but open inodes */
/*
* i_disksize keeps track of what the inode size is ON DISK, not
* in memory.  During truncate, i_size is set to the new size by
* the VFS prior to calling ext3_truncate(), but the filesystem won't
* set i_disksize to 0 until the truncate is actually under way.
*
* The intent is that i_disksize always represents the blocks which
* are used by this file.  This allows recovery to restart truncate
* on orphans if we crash during truncate.  We actually write i_disksize
* into the on-disk inode when writing inodes out, instead of i_size.
*
* The only time when i_disksize and i_size may be different is when
* a truncate is in progress.  The only things which change i_disksize
* are ext3_get_block (growth) and ext3_truncate (shrinkth).
*/
loff_t  i_disksize;
/* on-disk additional length */
__u16 i_extra_isize;
/*
* truncate_mutex is for serialising ext3_truncate() against
* ext3_getblock().  In the 2.4 ext2 design, great chunks of inode's
* data tree are chopped off during truncate. We can't do that in
* ext3 because whenever we perform intermediate commits during
* truncate, the inode and all the metadata blocks *must* be in a
* consistent state which allows truncation of the orphans to restart
* during recovery.  Hence we must fix the get_block-vs-truncate race
* by other means, so we have truncate_mutex.
*/
struct mutex truncate_mutex;
/*
* Transactions that contain inode's metadata needed to complete
* fsync and fdatasync, respectively.
*/
atomic_t i_sync_tid;
atomic_t i_datasync_tid;
/* Embedded generic inode; the anchor for container_of() */
struct inode vfs_inode;
};

__le32 i_data[15]这个数组就是上一段中所提到的那个数组。

注意，在遥远的2.4的古代，不同文件系统索引节点的内存映像(ext3_inode_info，reiserfs_inode_info，msdos_inode_info ...)都是用一个union内嵌在inode数据结构中的. 但inode作为一种非常基本的数据结构而言，这样搞太大了，不利于快速的分配和回收。但是后来发明了container_of(...)这种方法后，就把union移到了外部，我们可以用类似container_of(inode, struct ext3_inode_info, vfs_inode)，从inode出发，得到它的"容器"。

dentry和inode终究都是在内存中的，它们的原始信息必须要有一个载体。否则断电之后岂不是玩完了？且听我慢慢道来。

文件可以分为磁盘文件，设备文件，和特殊文件三种。设备文件暂且不表。

磁盘文件
就磁盘文件而言，dentry和inode的载体在存储介质(磁盘)上。对于像ext3这样的磁盘文件系统来说，存储介质中的目录项和索引节点载体如下，

fs/ext3/ext3.h

/*
 * struct ext3_inode - the ext3 inode as it is stored on disk.
 *
 * This is the persistent carrier from which the in-memory inode is
 * built.  Multi-byte fields are declared with little-endian types
 * (__le16/__le32).  Because this is an on-disk format, field order and
 * sizes must not be changed.
 */
struct ext3_inode {
__le16  i_mode;     /* File mode */
__le16  i_uid;      /* Low 16 bits of Owner Uid */
__le32  i_size;     /* Size in bytes */
__le32  i_atime;    /* Access time */
__le32  i_ctime;    /* Inode change time (not file creation time) */
__le32  i_mtime;    /* Modification time */
__le32  i_dtime;    /* Deletion Time */
__le16  i_gid;      /* Low 16 bits of Group Id */
__le16  i_links_count;  /* Links count */
__le32  i_blocks;   /* Blocks count */
__le32  i_flags;    /* File flags */
union {
struct {
__u32  l_i_reserved1;
} linux1;
struct {
__u32  h_i_translator;
} hurd1;
struct {
__u32  m_i_reserved1;
} masix1;
} osd1;             /* OS dependent 1 */
__le32  i_block[EXT3_N_BLOCKS];/* Pointers to blocks: where the file's contents live on disk */
__le32  i_generation;   /* File version (for NFS) */
__le32  i_file_acl; /* File ACL */
__le32  i_dir_acl;  /* Directory ACL */
__le32  i_faddr;    /* Fragment address */
union {
struct {
__u8    l_i_frag;   /* Fragment number */
__u8    l_i_fsize;  /* Fragment size */
__u16   i_pad1;
__le16  l_i_uid_high;   /* these 2 fields    */
__le16  l_i_gid_high;   /* were reserved2[0] */
__u32   l_i_reserved2;
} linux2;
struct {
__u8    h_i_frag;   /* Fragment number */
__u8    h_i_fsize;  /* Fragment size */
__u16   h_i_mode_high;
__u16   h_i_uid_high;
__u16   h_i_gid_high;
__u32   h_i_author;
} hurd2;
struct {
__u8    m_i_frag;   /* Fragment number */
__u8    m_i_fsize;  /* Fragment size */
__u16   m_pad1;
__u32   m_i_reserved2[2];
} masix2;
} osd2;             /* OS dependent 2 */
__le16  i_extra_isize;  /* NOTE(review): presumably the on-disk counterpart of ext3_inode_info.i_extra_isize */
__le16  i_pad1;
};

fs/ext3/ext3.h

/*
 * struct ext3_dir_entry_2 - the ext3 directory entry as stored on disk.
 *
 * Directory entries are stored in the data area, as the contents of the
 * directory file.  Each entry maps a file name to an inode number.
 * On-disk format: field order and sizes must not be changed.
 */
struct ext3_dir_entry_2 {
__le32  inode;          /* Inode number */
__le16  rec_len;        /* Directory entry length */
__u8    name_len;       /* Name length */
__u8    file_type;      /* NOTE(review): file type code; encoding not shown here */
char    name[EXT3_NAME_LEN];    /* File name */
};

__le32 i_block[EXT3_N_BLOCKS];

i_block数组指示了文件的内容所存放的地点(在硬盘上的位置)。

ext3_inode是放在索引节点区，而ext3_dir_entry_2是以文件内容的形式存放在数据区。我们只要知道了ino，由于ext3_inode大小已知，我们就可以计算出ext3_inode在索引节点区的位置( ino * sizeof(ext3_inode) )，而得到了ext3_inode，我们根据i_block就可以知道这个文件的数据存放的地点。将磁盘上ext3_inode的内容读入到ext3_inode_info中的函数是ext3_read_inode()(在较新的内核中，这一工作由ext3_iget()完成)。以一个有100 block的硬盘为例，一个文件系统的组织布局大致如下图。位图区中的每一位表示每一个相应的对象有没有被使用。

特殊文件
特殊文件在内存中有inode和dentry数据结构，但是不一定在存储介质上有"索引节点"，它断电之后的确就玩完了，所以不需要什么载体。当从一个特殊文件读时，所读出的数据是由系统内部按一定的规则临时生成的，或从内存中收集，加工出来的。sysfs里面就是典型的特殊文件。它存储的信息都是由系统动态生成的，它动态地包含了整个机器的硬件资源情况。对sysfs的读写就相当于从kobject层次结构中存取数据。

还请注意, 我们谈到目录项和索引节点时，有两种含义。一种是在存储介质(硬盘)中的(如ext3_inode)，一种是在内存中的，后者是根据前者生成的。内存中的表示就是dentry和inode，它是VFS中的一层，不管什么样的文件系统，最后在内存中描述它的都是dentry和inode结构。我们使用不同的文件系统，就是将它们各自的文件信息都抽象到dentry和inode中去。这样对于高层来说，我们就可以不关心底层的实现，我们使用的都是一系列标准的函数调用。这就是VFS的精髓，实际上就是面向对象。

我们在进程中打开一个文件F，实际上就是要在内存中建立F的dentry和inode结构，并让它们与进程结构联系起来，把VFS中定义的接口给接起来。我们来看一看这个经典的图。这张图之于文件系统，就像每天爱你多一些之于张学友，番茄炒蛋之于复旦南区食堂，刻骨铭心。

Linux那些事儿之我是Sysfs(7)dentry与inode相关推荐

Linux那些事儿之我是Sysfs(6)文件系统
接下来,我们进入sysfs部分.看看 kobject_add()->kobject_add_varg->kobject_add_internal->create_dir()-> ...
Linux那些事儿之我是Sysfs(9)sysfs文件系统模型
最近Linus炮轰C++,"C++是一种糟糕的(horrible)语言.而且因为有大量不够标准的程序员在使用而使许多真正懂得底层问题,而不会折腾那些白痴'对象模型'".牛人就是牛气 ...
Linux那些事儿之我是Sysfs(8)一起散散步-pathwalk
前面说过,只要知道文件的索引节点号,就可以得到那个文件.但是我们在操作文件时,从没听说谁会拿着索引节点号来操作文件,我们只知道文件名而已.它们是如何"和谐"起来的呢?linux把目 ...
Linux那些事儿之我是Sysfs(11)sysfs 创建普通文件
sysfs文件系统中,普通文件对应于kobject中的属性.用sysfs_create_file(),参数如下: sysfs_create_file(struct kobject * kobj, co ...
Linux那些事儿之我是Sysfs(3)设备模型上层容器
§1 bus 系统中总线由struct bus_type描述,定义为: include/linux/device.h struct bus_type { const char *name;总线类型的名 ...
Linux那些事儿之我是Sysfs(2)linux设备底层模型
关于linux设备模型网上有一些论述,有些东西我就用了拿来主义,进行了修改和整理. §1 Kobject Kobject 是Linux 2.6引入的新的设备管理机制,在内核中由struct kobje ...
Linux那些事儿之我是Sysfs(1)sysfs初探
"sysfs is a ram-based filesystem initially based on ramfs. It provides a means to export kernel ...
linux lddbus设备,Linux那些事儿之我是Sysfs(4)举例一lddbus | 技术部落
接下来我们从例子着手 localhost:/home/XX/examples/lddbus#insmod lddbus.ko 此时再看/sys/bus/ 这时就多了一个文件夹ldd.里面的文件构成是这 ...
linux getdents 例子,Linux那些事儿之我是Sysfs(12)举例三：sysfs读入文件夹内容
上回我们说到,如何创建文件夹和文件.我们发现,在sysfs中,inode并不那么重要.这是因为我们所要读写的信息已经就在内存中,并且已经形成了层次结构.我们只需有dentry,就可以dentry-&g ...

Linux那些事儿之我是Sysfs(7)dentry与inode

Linux那些事儿之我是Sysfs(7)dentry与inode相关推荐

最新文章

热门文章