1、前言
想要了解EXT文件系统的工作原理,那了解文件系统在磁盘上的分布就是必不可少的。这一节主要介绍EXT文件系统硬盘存储的物理结构。
 由于当前主流的CPU架构均采用小端模式,因此下文介绍均以小端模式为准。
2、超级块
2.1 属性
下表列举出超级块中相对重要的属性。
| 属性名 | 含义 | 
|---|---|
| s_log_block_size | 块大小,计算公式 = 2 ^ (10 + s_log_block_size) | 
| s_blocks_per_group | 每个块组中块的个数 | 
| s_inodes_per_group | 每个块组中索引的个数 | 
| s_magic | 魔数(0xEF53) | 
| s_inode_size | 索引大小,单位:byte | 
| s_feature_compat | 兼容特性 | 
| s_feature_incompat | 不兼容特性 | 
| s_feature_ro_compat | 只读兼容特性 | 
| s_backup_bgs | 包含超级块备份的块组号。 | 
| s_desc_size | 块组描述符大小 | 
2.2 特性
一些默认开启或者常用的文件系统特性。
 
| 特性名 | 含义 | 
|---|---|
| COMPAT_HAS_JOURNAL | 开启日志。 | 
| COMPAT_EXT_ATTR | 支持扩展属性。 | 
| COMPAT_RESIZE_INODE | 保留块组描述符。需要开启RO_COMPAT_SPARSE_SUPER特性。 | 
| COMPAT_SPARSE_SUPER2 | 稀疏超级块V2。开启本特性后,仅s_backup_bgs 属性指向的2个块组备份超级块。 | 
| INCOMPAT_FILETYPE | app_ext4_dir_entry结构中包含文件类型。 | 
| INCOMPAT_META_BG | 开启元块组属性。与COMPAT_RESIZE_INODE特性互斥。 | 
| INCOMPAT_64BIT | 支持超过2^32个块。 | 
| INCOMPAT_FLEX_BG | 开启弹性块组。 | 
| INCOMPAT_INLINE_DATA | 支持内联文件和目录。 | 
| RO_COMPAT_SPARSE_SUPER | 稀疏超级块。 | 
2.3 参考代码
typedef struct {ub32 s_inodes_count;       /* Inodes count */ub32 s_blocks_count;       /* Blocks count */ub32 s_r_blocks_count;     /* Reserved blocks count */ub32 s_free_blocks_count;  /* Free blocks count */ub32 s_free_inodes_count;  /* Free inodes count */ub32 s_first_data_block;   /* First Data Block */ub32 s_log_block_size;     /* Block size */ub32 s_log_cluster_size;   /* Allocation cluster size */ub32 s_blocks_per_group;   /* # Blocks per group */ub32 s_clusters_per_group; /* # Fragments per group */ub32 s_inodes_per_group;   /* # Inodes per group */ub32 s_mtime;              /* Mount time */ub32 s_wtime;              /* Write time */ub16 s_mnt_count;          /* Mount count */ub16 s_max_mnt_count;      /* Maximal mount count */ub16 s_magic;              /* Magic signature */ub16 s_state;              /* File system state */ub16 s_errors;             /* Behaviour when detecting errors */ub16 s_minor_rev_level;    /* minor revision level */ub32 s_lastcheck;          /* time of last check */ub32 s_checkinterval;      /* max. 
time between checks */ub32 s_creator_os;         /* OS */ub32 s_rev_level;          /* Revision level */ub16 s_def_resuid;         /* Default uid for reserved blocks */ub16 s_def_resgid;         /* Default gid for reserved blocks *//** These fields are for EXT2_DYNAMIC_REV superblocks only.** Note: the difference between the compatible feature set and* the incompatible feature set is that if there is a bit set* in the incompatible feature set that the kernel doesn't* know about, it should refuse to mount the filesystem.** e2fsck's requirements are more strict; if it doesn't know* about a feature in either the compatible or incompatible* feature set, it must abort and not try to meddle with* things it doesn't understand...*/ub32 s_first_ino;              /* First non-reserved inode */ub16 s_inode_size;             /* size of inode structure */ub16 s_block_group_nr;         /* block group # of this superblock */ub32 s_feature_compat;         /* compatible feature set */ub32 s_feature_incompat;       /* incompatible feature set */ub32 s_feature_ro_compat;      /* readonly-compatible feature set */ub8 s_uuid[16];                /* 128-bit uuid for volume */b8 s_volume_name[16];          /* volume name */b8 s_last_mounted[64];         /* directory where last mounted */ub32 s_algorithm_usage_bitmap; /* For compression *//** Performance hints.  
Directory preallocation should only* happen if the EXT2_FEATURE_COMPAT_DIR_PREALLOC flag is on.*/ub8 s_prealloc_blocks;      /* Nr of blocks to try to preallocate*/ub8 s_prealloc_dir_blocks;  /* Nr to preallocate for dirs */ub16 s_reserved_gdt_blocks; /* Per group table for online growth *//** Journaling support valid if EXT2_FEATURE_COMPAT_HAS_JOURNAL set.*/ub8 s_journal_uuid[16]; /* uuid of journal superblock */ub32 s_journal_inum;    /* inode number of journal file */ub32 s_journal_dev;     /* device number of journal file */ub32 s_last_orphan;     /* start of list of inodes to delete */ub32 s_hash_seed[4];    /* HTREE hash seed */ub8 s_def_hash_version; /* Default hash version to use */ub8 s_jnl_backup_type;  /* Default type of journal backup */ub16 s_desc_size;       /* Group desc. size: INCOMPAT_64BIT */ub32 s_default_mount_opts;ub32 s_first_meta_bg;       /* First metablock group */ub32 s_mkfs_time;           /* When the filesystem was created */ub32 s_jnl_blocks[17];      /* Backup of the journal inode */ub32 s_blocks_count_hi;     /* Blocks count high 32bits */ub32 s_r_blocks_count_hi;   /* Reserved blocks count high 32 bits*/ub32 s_free_blocks_hi;      /* Free blocks count */ub16 s_min_extra_isize;     /* All inodes have at least # bytes */ub16 s_want_extra_isize;    /* New inodes should reserve # bytes */ub32 s_flags;               /* Miscellaneous flags */ub16 s_raid_stride;         /* RAID stride */ub16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ub64 s_mmp_block;           /* Block for multi-mount protection */ub32 s_raid_stripe_width;   /* blocks on all data disks (N*stride)*/ub8 s_log_groups_per_flex;  /* FLEX_BG group size */ub8 s_reserved_char_pad;ub16 s_reserved_pad;            /* Padding to next 32bits */ub64 s_kbytes_written;          /* nr of lifetime kilobytes written */ub32 s_snapshot_inum;           /* Inode number of active snapshot */ub32 s_snapshot_id;             /* sequential ID of active snapshot */ub64 
s_snapshot_r_blocks_count; /* reserved blocks for activesnapshot's future use */ub32 s_snapshot_list;     /* inode number of the head of the on-disk snapshotlist */ub32 s_error_count;       /* number of fs errors */ub32 s_first_error_time;  /* first time an error happened */ub32 s_first_error_ino;   /* inode involved in first error */ub64 s_first_error_block; /* block involved of first error */ub8 s_first_error_func[32]; /* function where the error happened */ub32 s_first_error_line;    /* line number where error happened */ub32 s_last_error_time;     /* most recent time of an error */ub32 s_last_error_ino;      /* inode involved in last error */ub32 s_last_error_line;     /* line number where error happened */ub64 s_last_error_block;    /* block involved of last error */ub8 s_last_error_func[32];  /* function where the error happened */ub8 s_mount_opts[64];ub32 s_usr_quota_inum;  /* inode number of user quota file */ub32 s_grp_quota_inum;  /* inode number of group quota file */ub32 s_overhead_blocks; /* overhead blocks/clusters in fs */ub32 s_backup_bgs[2];   /* If sparse_super2 enabled */ub32 s_reserved[106];   /* Padding to the end of the block */ub32 s_checksum;        /* crc32c(superblock) */
} app_ext4_super_block;
3、组描述符
3.1 属性
下表列举出组描述符的关键属性。
| 属性名 | 含义 | 
|---|---|
| bg_inode_table | 索引表的物理偏移。 | 
| bg_inode_table_hi | 索引表的物理偏移的高32位。 | 
3.2 索引表计算
已知目标文件的Inode = 357,块大小 block_size_ = 4096,每个块组的Inode数 inode_count_ = 8192,组描述符大小 gdt_size_ = 32,索引Inode大小 inode_size_ = 256,该如何找到文件对应的组描述符和索引节点呢?
首先,计算出文件所在的块组,bg_no = (inode_no - 1) / inode_count_ = 356 / 8192 = 0, 即文件属于第一个块组。
接着,计算文件所在的组描述符的位置,其中 gdt_count_ 为每个块可容纳的组描述符个数(块大小 / 组描述符大小 = 4096 / 32),gdt_block_no = bg_no / gdt_count_ = 0 / (4096 / 32) = 0,即文件所在的组描述符在组描述符表的第一个块中。
然后,计算文件所在的组描述符在块中的位置,gdt_index = bg_no % gdt_count_ = 0,块中的第一个组描述符即文件所在的组描述符。
其次,计算文件在所在块组中的索引, inode_partition = (inode_no - 1) % inode_count_ = 356 % 8192 = 356, 即文件是块组中下标为356(从0开始计数)的inode节点。
再次,计算文件在索引表中的位置,inode_block_no = inode_partition / it_inode_count = 356 / (4096 / 256) = 22(向下取整), 即文件所在的索引在索引表中下标为22(从0开始计数)的块中。
最后,从组描述的bg_inode_table和bg_inode_table_hi获取inode_table_no,计算出索引表的偏移位置file_offset = (inode_table_no + inode_block_no) * 4096。
默认情况下,所有的组描述符在第一个块组中都存在备份,因此从第一个块组中读取对应的组描述符即可。
// inode 0 is defined but not exist, so actual inode no begin with 1.
// the bg number of the inode_no
b32 bg_no = (inode_no - 1) / volume_->inode_count_;
// the gdt number in bg
b32 gdt_block_no = bg_no / volume_->gdt_count_;
// the index of gdt in the bg which this inode in
b32 gdt_index = bg_no % volume_->gdt_count_;
// the index of inode in the bg which this inode in
b32 inode_partition = (inode_no - 1) % volume_->inode_count_;
// the inode count in one IT block
b32 it_inode_count = volume_->block_size_ / volume_->inode_size_;
// the index of IT block in the bg which this inode in
b32 inode_block_no = inode_partition / it_inode_count;
// move file pointer to gdt blockb64 file_offset = 0;
if (volume_->meta_group_)file_offset = GetGDTOffset(gdt_block_no * (b64)volume_->gdt_count_);
else// use gdt in first bgfile_offset = GetGDTOffset(0) + gdt_block_no * (b64)volume_->block_size_;
if (lseek64(volume_->fd_, file_offset, SEEK_SET) != file_offset) goto IOErr;gdt_record_ = (app_ext4_group_desc *)new char[volume_->block_size_];
if (volume_->block_size_ !=read(volume_->fd_, gdt_record_, volume_->block_size_))goto IOErr;// get offset of block which inode in
if (!volume_->extend64_) {file_offset = (gdt_record_[gdt_index].bg_inode_table + inode_block_no) *(b64)volume_->block_size_;
} else {app_ext4_group_desc64 *gdt_record =(app_ext4_group_desc64 *)((char *)gdt_record_.get() +volume_->gdt_size_ * gdt_index);b64 inode_table_no =gdt_record->bg_inode_table | ((b64)gdt_record->bg_inode_table_hi << 32);file_offset = (inode_table_no + inode_block_no) * volume_->block_size_;
}
if (lseek64(volume_->fd_, file_offset, SEEK_SET) != file_offset) goto IOErr;inode_record_ = (app_ext4_inode *)new char[volume_->block_size_];
if (volume_->block_size_ !=read(volume_->fd_, inode_record_, volume_->block_size_))goto IOErr;
3.3 参考代码
// On-disk block group descriptor, 32-bit layout.
typedef struct {
  ub32 bg_block_bitmap;          // block holding the block bitmap
  ub32 bg_inode_bitmap;          // block holding the inode bitmap
  ub32 bg_inode_table;           // first block of the inode table
  ub16 bg_free_blocks_count;     // free blocks count
  ub16 bg_free_inodes_count;     // free inodes count
  ub16 bg_used_dirs_count;       // directories count
  ub16 bg_flags;                 // EXT4_BG_flags (INODE_UNINIT, etc)
  ub32 bg_exclude_bitmap_lo;     // exclude bitmap for snapshots
  ub16 bg_block_bitmap_csum_lo;  // crc32c(s_uuid+grp_num+bitmap) LSB
  ub16 bg_inode_bitmap_csum_lo;  // crc32c(s_uuid+grp_num+bitmap) LSB
  ub16 bg_itable_unused;         // unused inodes count
  ub16 bg_checksum;              // crc16(sb_uuid+group+desc)
} app_ext4_group_desc;

// On-disk block group descriptor, 64-bit layout: the 32-bit fields followed
// by the high (MSB) halves.
typedef struct {
  ub32 bg_block_bitmap;          // block holding the block bitmap
  ub32 bg_inode_bitmap;          // block holding the inode bitmap
  ub32 bg_inode_table;           // first block of the inode table
  ub16 bg_free_blocks_count;     // free blocks count
  ub16 bg_free_inodes_count;     // free inodes count
  ub16 bg_used_dirs_count;       // directories count
  ub16 bg_flags;                 // EXT4_BG_flags (INODE_UNINIT, etc)
  ub32 bg_exclude_bitmap_lo;     // exclude bitmap for snapshots
  ub16 bg_block_bitmap_csum_lo;  // crc32c(s_uuid+grp_num+bitmap) LSB
  ub16 bg_inode_bitmap_csum_lo;  // crc32c(s_uuid+grp_num+bitmap) LSB
  ub16 bg_itable_unused;         // unused inodes count
  ub16 bg_checksum;              // crc16(sb_uuid+group+desc)
  ub32 bg_block_bitmap_hi;       // block bitmap block, MSB
  ub32 bg_inode_bitmap_hi;       // inode bitmap block, MSB
  ub32 bg_inode_table_hi;        // inode table block, MSB
  ub16 bg_free_blocks_count_hi;  // free blocks count, MSB
  ub16 bg_free_inodes_count_hi;  // free inodes count, MSB
  ub16 bg_used_dirs_count_hi;    // directories count, MSB
  ub16 bg_itable_unused_hi;      // unused inodes count, MSB
  ub32 bg_exclude_bitmap_hi;     // exclude bitmap block, MSB
  ub16 bg_block_bitmap_csum_hi;  // crc32c(s_uuid+grp_num+bitmap) MSB
  ub16 bg_inode_bitmap_csum_hi;  // crc32c(s_uuid+grp_num+bitmap) MSB
  ub32 bg_reserved;
} app_ext4_group_desc64;
4、索引节点
4.1 属性
下表列举出Inode中相对重要的属性。
| 属性名 | 含义 | 
|---|---|
| i_mode | 文件属性和文件类型。 | 
| i_size_lo | 文件大小低32位。 | 
| i_links_count | 硬链接数量。 | 
| i_flags | 标志位。 | 
| i_block | 块图或者扩展树,存储文件内容或者目录索引。 | 
| i_size_high | 文件大小高32位。 | 
| i_extra_isize | 扩展属性大小。 | 
4.2 文件标识
| 值 | 含义 | 
|---|---|
| 0x1000 | S_IFIFO (FIFO) | 
| 0x2000 | S_IFCHR (Character device) | 
| 0x4000 | S_IFDIR (Directory) | 
| 0x6000 | S_IFBLK (Block device) | 
| 0x8000 | S_IFREG (Regular file) | 
| 0xA000 | S_IFLNK (Symbolic link) | 
| 0xC000 | S_IFSOCK (Socket) | 
4.3 文件内容
通常情况下,i_block中用于存储文件所有块的索引信息。某些特殊场景下,会用于其它情况。
-  软链接(Symbolic Links) 
 当链接的目标路径长度小于60时, 会将目标路径存储在i_block中。
-  内联数据(Inline Data) 
 当文件系统开启Inline Data特性,且数据长度小于156(目前)时,用于存储内容的前60个字节。
-  直接/间接块索引(Direct/Indirect Block Addressing) 
 i_block[0:11]:存储数据内容的块号。
 i_block[12] :指向间接数据块(存储数据块号的数据块)。
 i_block[13]:指向双重间接数据块(存储间接数据块的数据块)。
 i_block[14]:指向三重间接数据块(存储双重间接数据块的数据块)。
  
-  扩展树索引(Extent Tree) 
 通过树的形式管理文件或者文件夹的数据块。扩展树的详细介绍请参考最后一节。
4.4 参考代码
#define EXT4_N_BLOCKS 15
typedef struct {ub16 i_mode;        /* File mode */ub16 i_uid;         /* Low 16 bits of Owner Uid */ub32 i_size;        /* Size in bytes */ub32 i_atime;       /* Access time */ub32 i_ctime;       /* Inode Change time */ub32 i_mtime;       /* Modification time */ub32 i_dtime;       /* Deletion Time */ub16 i_gid;         /* Low 16 bits of Group Id */ub16 i_links_count; /* Links count */ub32 i_blocks;      /* Blocks count */ub32 i_flags;       /* File flags */union {struct {ub32 l_i_version; /* was l_i_reserved1 */} linux1;struct {ub32 h_i_translator;} hurd1;} osd1;                      /* OS dependent 1 */ub32 i_block[EXT4_N_BLOCKS]; /* Pointers to blocks */ub32 i_generation;           /* File version (for NFS) */ub32 i_file_acl;             /* File ACL */ub32 i_size_high;            /* Formerly i_dir_acl, directory ACL */ub32 i_faddr;                /* Fragment address */union {struct {ub16 l_i_blocks_hi;ub16 l_i_file_acl_high;ub16 l_i_uid_high;    /* these 2 fields    */ub16 l_i_gid_high;    /* were reserved2[0] */ub16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */ub16 l_i_reserved;} linux2;struct {ub8 h_i_frag;  /* Fragment number */ub8 h_i_fsize; /* Fragment size */ub16 h_i_mode_high;ub16 h_i_uid_high;ub16 h_i_gid_high;ub32 h_i_author;} hurd2;} osd2; /* OS dependent 2 */ub16 i_extra_isize;ub16 i_checksum_hi;  /* crc32c(uuid+inum+inode) */ub32 i_ctime_extra;  /* extra Change time (nsec << 2 | epoch) */ub32 i_mtime_extra;  /* extra Modification time (nsec << 2 | epoch) */ub32 i_atime_extra;  /* extra Access time (nsec << 2 | epoch) */ub32 i_crtime;       /* File creation time */ub32 i_crtime_extra; /* extra File creation time (nsec << 2 | epoch)*/ub32 i_version_hi;   /* high 32 bits for 64-bit version */
} app_ext4_inode;
5、扩展属性
扩展属性通常用于存储文件的ACLs访问权限和一些其他的安全属性,例如selinux等。因此通常情况下,使用文件系统时并不需要关注文件的扩展属性。
 但有一种例外情况,那就是开启了内联数据特性后,文件的一部分数据内容会存储到扩展属性中。
 我们可以在2个地方找到文件的扩展属性,其一,2个索引信息的中间;其二,i_file_acl指向的块。而内联数据则存在于第一个地方。
 扩展属性块以app_ext4_attr_header结构开始,但在索引信息后时只存在第一个字段h_magic = 0xEA020000。
 实际的扩展属性用app_ext4_attr_entry管理,当e_name_index = 7且e_name = data时,则代表内联数据。
// Header that starts an extended attribute block.
typedef struct {
  ub32 h_magic;        // magic number for identification
  ub32 h_refcount;     // reference count
  ub32 h_blocks;       // number of disk blocks used
  ub32 h_hash;         // hash value of all attributes
  ub32 h_reserved[4];  // zero right now
} app_ext4_attr_header;

// Descriptor of a single extended attribute.
typedef struct {
  ub8 e_name_len;      // length of name
  ub8 e_name_index;    // attribute name index
  ub16 e_value_offs;   // offset in disk block of value
  ub32 e_value_block;  // disk block attribute is stored on (n/i)
  ub32 e_value_size;   // size of attribute value
  ub32 e_hash;         // hash value of name and value
} app_ext4_attr_entry;

// Fetch the inline data stored in the extended attributes.
app_ext4_attr_header *attr_header =(app_ext4_attr_header *)((b8 *)&inode_info_->i_extra_isize +inode_info_->i_extra_isize);if (attr_header->h_magic != kExtAttrMagic) return false;// Extended attributes, when stored after the inode,// have a header ext4_xattr_ibody_header that is 4 bytes longapp_ext4_attr_entry *attr_data =(app_ext4_attr_entry *)((b8 *)attr_header + sizeof(attr_header->h_magic));while (attr_data->e_name_index != kExtAttrDataIdx ||attr_data->e_name_len != sizeof(kExtAttrDataName)) {attr_data =(app_ext4_attr_entry *)((b8 *)attr_data + sizeof(app_ext4_attr_entry) +(attr_data->e_name_len + 3) / 4 * 4);}// For an inode attribute e_value_offs is relative to the first entryif (*(b32 *)((b8 *)attr_data + sizeof(app_ext4_attr_entry)) ==kExtAttrDataName) {memcpy(inline_data_,(b8 *)attr_header + sizeof(attr_header->h_magic) +attr_data->e_value_offs,attr_data->e_value_size);}
6、扩展树
由于直接/间接块索引的种种缺陷,在EXT4中推出了扩展树取而代之。扩展树,顾名思义,通过树的形式管理数据块。
 其中每个节点以app_ext4_extent_header开始,非叶子节点时,后接app_ext4_extent_idx结构;叶子节点则紧跟app_ext4_extent结构。
 app_ext4_extent_header用于存储当前节点的信息。
| 变量 | 含义 | 
|---|---|
| eh_magic | 魔数,0xF30A。 | 
| eh_entries | 当前节点存储的数据个数。 | 
| eh_depth | 当前节点的深度,0则代表当前是叶子节点。 | 
app_ext4_extent存储实际的数据块信息。
| 变量 | 含义 | 
|---|---|
| ee_block | 起始的逻辑块地址。 | 
| ee_len | 当前extent管理的实际物理块个数。ee_len大于32768时表示该extent尚未初始化,实际长度需减去32768,即 ee_len = ee_len > 32768 ? ee_len - 32768 : ee_len | 
| ee_start_hi / ee_start | ee_start_hi为高16位,ee_start为低32位;将ee_start_hi左移32位后与ee_start按位或,即可得出起始的物理块地址。 | 
app_ext4_extent_idx存储下一层节点的信息。
| 变量 | 含义 | 
|---|---|
| ei_block | 起始的逻辑块地址。 | 
| ei_leaf/ ei_leaf_hi | ei_leaf为低32位,ei_leaf_hi为高16位;将ei_leaf_hi左移32位后与ei_leaf按位或,即可得出下一层节点的物理块地址。 | 
// Header found at the start of every extent tree node.
typedef struct {
  ub16 eh_magic;       // probably will support different formats
  ub16 eh_entries;     // number of valid entries
  ub16 eh_max;         // capacity of store in entries
  ub16 eh_depth;       // has tree real underlying blocks?
  ub32 eh_generation;  // generation of the tree
} app_ext4_extent_header;

// Leaf entry of the extent tree.
typedef struct {
  ub32 ee_block;     // first logical block extent covers
  ub16 ee_len;       // number of blocks covered by extent
  ub16 ee_start_hi;  // high 16 bits of physical block
  ub32 ee_start;     // low 32 bits of physical block
} app_ext4_extent;

// Index (non-leaf) entry of the extent tree.
typedef struct {
  ub32 ei_block;    // index covers logical blocks from 'block'
  ub32 ei_leaf;     // pointer to the physical block of the next level;
                    // leaf or next index could be there
  ub16 ei_leaf_hi;  // high 16 bits of physical block
  ub16 ei_unused;
} app_ext4_extent_idx;