從rm到linux虛擬檔案系統

Linkerist發表於2019-01-25

從rm到linux虛擬檔案系統


後端有時候rm,會出現一些問題。

這裡作為一個子問題,討論一下rm之後,發生的一些事。

開啟rm原始碼:

[qianzichen@dev03v /src/app/coreutils/coreutils-8.21]$ vi src/rm.c
複製程式碼

從main函式開始:

int
main (int argc, char **argv)
{
  ...
  while ((c = getopt_long (argc, argv, "dfirvIR", long_opts, NULL)) != -1)
    {
      switch (c)
        {
        case 'f':
          x.interactive = RMI_NEVER;
          break;
        ...
        }
    }
  ...
  enum RM_status status = rm (file, &x);
}
複製程式碼

首先解析命令列引數,然後呼叫了rm:

enum RM_status status = rm (file, &x);
複製程式碼

作者把rm函式的實現從rm.c中抽了出來,放在remove.c中:

/* Remove FILEs, honoring options specified via X.
   Return RM_OK if successful.  */
enum RM_status
rm (char *const *file, struct rm_options const *x) 
{
  enum RM_status rm_status = RM_OK;

  if (*file)
    {
      FTS *fts = xfts_open (file, bit_flags, NULL);
      while (1)
        {
           ...
        }
    }
...
}
複製程式碼

file引數是一個只讀指標陣列,代表要刪除的檔名列表,x引數的結構定義如下,儲存從命令列中解析後的rm的選項。

struct rm_options
{
  /* If true, ignore nonexistent files.  */
  bool ignore_missing_files;

  /* If true, query the user about whether to remove each file.  */
  enum rm_interactive interactive;
...
  /* If true, recursively remove directories.  */
  bool recursive;
  bool require_restore_cwd;
};
複製程式碼

當file列表存在時,rm呼叫xfts_open:

FTS *
xfts_open (char * const *argv, int options,
           int (*compar) (const FTSENT **, const FTSENT **))
{
  FTS *fts = fts_open (argv, options | FTS_CWDFD, compar);
  if (fts == NULL)
    {
...
  return fts;
}
複製程式碼

xfts_open返回fts_open的有效返回值。fts_open的實現如下:

FTS *
fts_open (char * const *argv,
          register int options,
          int (*compar) (FTSENT const **, FTSENT const **))
{
        register FTS *sp;

        /* Options check. */
        /* Allocate/initialize the stream */
        /* Initialize fts_cwd_fd.  */
        sp->fts_cwd_fd = AT_FDCWD;
        if ( ISSET(FTS_CWDFD) && ! HAVE_OPENAT_SUPPORT)
          {
            int fd = open (".",
                           O_SEARCH | (ISSET (FTS_NOATIME) ? O_NOATIME : 0));
        /*
         * Start out with 1K of file name space, and enough, in any case,
         * to hold the user's file names.
         */
        /* Allocate/initialize root's parent. */
        if (*argv != NULL) {
                if ((parent = fts_alloc(sp, "", 0)) == NULL)
                        goto mem2;
                parent->fts_level = FTS_ROOTPARENTLEVEL;
          }

        /* Allocate/initialize root(s). */
        for (root = NULL, nitems = 0; *argv != NULL; ++argv, ++nitems) {
                /*
                 * If comparison routine supplied, traverse in sorted
                 * order; otherwise traverse in the order specified.
                 */
                if (compar) {
                        p->fts_link = root;
                        root = p;
                } else {
                        p->fts_link = NULL;
                        if (root == NULL)
                                tmp = root = p;
                        else {
                                tmp->fts_link = p;
                                tmp = p;
                        }
                }
        }
        if (compar && nitems > 1)
                root = fts_sort(sp, root, nitems);
...  
        if (!ISSET(FTS_NOCHDIR) && !ISSET(FTS_CWDFD)
            && (sp->fts_rfd = diropen (sp, ".")) < 0)
                SET(FTS_NOCHDIR);

        i_ring_init (&sp->fts_fd_ring, -1);
        return (sp);

mem3:   fts_lfree(root);
...
        return (NULL);
}
複製程式碼

引用中已去除了一些Error handling,可以看出主要是獲取檔案系統的一些資訊,儲存在FTS結構中,FTS結構定義如下:

typedef struct {
        struct _ftsent *fts_cur;        /* current node */
        int (*fts_compar) (struct _ftsent const **, struct _ftsent const **);
                                        /* compare fn */
...
        int fts_options;                /* fts_open options, global flags */
        struct hash_table *fts_leaf_optimization_works_ht;
        union {
...
                struct cycle_check_state *state;
        } fts_cycle;

        I_ring fts_fd_ring;
} FTS;
複製程式碼

再回到rm函式,它將在一個loop中通過fts_read讀取檔案系統資訊,並快取在ent中:

rm (char *const *file, struct rm_options const *x) 
{
  enum RM_status rm_status = RM_OK;

  if (*file)
    {
      FTS *fts = xfts_open (file, bit_flags, NULL);
      while (1)
        {
           ent = fts_read (fts);
           enum RM_status s = rm_fts (fts, ent, x);
        }
    }
...
}
複製程式碼

ent的結構比較大,這裡不展開了。

再通過rm_fts對某一個ent進行操作,這裡我們rm的是一個regular file,所以控制結構會執行到FTS_F分支下,最終呼叫excise。

static enum RM_status
rm_fts (FTS *fts, FTSENT *ent, struct rm_options const *x)
{
  switch (ent->fts_info)
    {
    case FTS_D:			/* preorder directory */
        if (s == RM_OK && is_empty_directory == T_YES)
          {
            /* When we know (from prompt when in interactive mode)
               that this is an empty directory, don't prompt twice.  */
            s = excise (fts, ent, x, true);
            fts_skip_tree (fts, ent);
          }
          ...
      }
    case FTS_F:			/* regular file */
      {
        bool is_dir = ent->fts_info == FTS_DP || ent->fts_info == FTS_DNR;
        enum RM_status s = prompt (fts, ent, is_dir, x, PA_REMOVE_DIR, NULL);
        if (s != RM_OK)
          return s;
        return excise (fts, ent, x, is_dir);
      }
    ...
    }
}
複製程式碼

這裡再次忽略一些容錯和優化,excise最終呼叫了unlinkat:

static enum RM_status
excise (FTS *fts, FTSENT *ent, struct rm_options const *x, bool is_dir)
{
  int flag = is_dir ? AT_REMOVEDIR : 0;
  if (unlinkat (fts->fts_cwd_fd, ent->fts_accpath, flag) == 0)
    {
      if (x->verbose)
        {
          printf ((is_dir
                   ? _("removed directory: %s\n")
          ...
        }
      return RM_OK;
    }
  ...
}
複製程式碼

如上我們看出,rm最終呼叫了unlinkat這一核心函式,比如,刪除a.txt:

unlinkat(AT_FDCWD, "a.txt", 0)
複製程式碼

使用者態rm呼叫了C庫中的unlinkat,經查詢,其宣告是在<unistd.h>中

#ifdef __USE_ATFILE
/* Remove the link NAME relative to FD.  */
extern int unlinkat (int __fd, const char *__name, int __flag)
     __THROW __nonnull ((2));
#endif

/* Remove the directory PATH.  */
extern int rmdir (const char *__path) __THROW __nonnull ((1));
複製程式碼

使用者態程式只要呼叫unlink函式就可以了,具體unlinkat函式的實現是由glibc 提供的,其定義在io/unlink.c中:

/* Remove the link named NAME.  */
int
__unlink (name)
     const char *name;
{
  if (name == NULL)
    {   
      __set_errno (EINVAL);
      return -1; 
    }   

  __set_errno (ENOSYS);
  return -1; 
}
stub_warning (unlink)

weak_alias (__unlink, unlink)
複製程式碼

額好吧,這兒是個弱符號,真正的實現在./sysdeps/unix/sysv/linux/unlinkat.c

...
/* Remove the link named NAME.  */
int
unlinkat (fd, file, flag)
     int fd;
     const char *file;
     int flag;
{
  int result;

#ifdef __NR_unlinkat
# ifndef __ASSUME_ATFCTS
  if (__have_atfcts >= 0)
# endif
    {
      result = INLINE_SYSCALL (unlinkat, 3, fd, file, flag);
# ifndef __ASSUME_ATFCTS
      if (result == -1 && errno == ENOSYS)
        __have_atfcts = -1;
      else
# endif
        return result;
    }
  char *buf = NULL;
    }
...
  INTERNAL_SYSCALL_DECL (err);

  if (flag & AT_REMOVEDIR)
    result = INTERNAL_SYSCALL (rmdir, err, 1, file);
  else
    result = INTERNAL_SYSCALL (unlink, err, 1, file);
...
}
複製程式碼

syscall的name為__NR_##name,通過巨集中字串粘合而得本例中的__NR_unlinkat。其定義在/usr/include/asm/unistd_64.h中。

#ifndef _ASM_X86_UNISTD_64_H
#define _ASM_X86_UNISTD_64_H 1

#define __NR_read 0
#define __NR_write 1
...
#define __NR_newfstatat 262
#define __NR_unlinkat 263
...
#define __NR_kexec_file_load 320
#define __NR_userfaultfd 323

#endif /* _ASM_X86_UNISTD_64_H */
複製程式碼

所以該巨集被啟用。

/* The *at syscalls were introduced just after 2.6.16-rc1.  Due to the way the
   kernel versions are advertised we can only rely on 2.6.17 to have
   the code.  On PPC they were introduced in 2.6.17-rc1,
   on SH in 2.6.19-rc1.  */
#if __LINUX_KERNEL_VERSION >= 0x020611 \
    && (!defined __sh__ || __LINUX_KERNEL_VERSION >= 0x020613)
# define __ASSUME_ATFCTS        1
#endif
複製程式碼

顯然可以看出,若kernel版本在2.6.17之後,__ASSUME_ATFCTS巨集被啟用。無需校驗__have_atfcts >= 0,直接呼叫INLINE_SYSCALL (unlinkat, 3, fd, file, flag)。

這裡直接看底層實現吧(./sysdeps/unix/sysv/linux/x86_64/sysdep.h),是一段內聯彙編:

# undef INLINE_SYSCALL_TYPES
# define INLINE_SYSCALL_TYPES(name, nr, args...) \
  ({                                                                          \
    unsigned long int resultvar = INTERNAL_SYSCALL_TYPES (name, , nr, args);  \
    if (__builtin_expect (INTERNAL_SYSCALL_ERROR_P (resultvar, ), 0))         \
      {                                                                       \
        __set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, ));                   \
        resultvar = (unsigned long int) -1;                                   \
      }                                                                       \
    (long int) resultvar; })

# undef INTERNAL_SYSCALL_DECL
# define INTERNAL_SYSCALL_DECL(err) do { } while (0)

# define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
  ({                                                                          \
    unsigned long int resultvar;                                              \
    LOAD_ARGS_##nr (args)                                                     \
    LOAD_REGS_##nr                                                            \
    asm volatile (                                                            \
    "syscall\n\t"                                                             \
    : "=a" (resultvar)                                                        \
    : "0" (name) ASM_ARGS_##nr : "memory", "cc", "r11", "cx");                \
    (long int) resultvar; })
# undef INTERNAL_SYSCALL
# define INTERNAL_SYSCALL(name, err, nr, args...) \
  INTERNAL_SYSCALL_NCS (__NR_##name, err, nr, ##args)

# define INTERNAL_SYSCALL_NCS_TYPES(name, err, nr, args...) \
複製程式碼

在syscall之前先將引數傳入暫存器。返回值在rax暫存器中(即約束"=a"對應的resultvar),通常0表示成功。

從C庫程式碼上來看,就是這麼實現了的,rm實用程式呼叫glibc,然後再到彙編syscall -> kernel

但是當前機器安裝的不一定是upstream的C庫。

我們還是來親眼看一下最終機器碼是如何實現的吧,我這裡直接反彙編一下:

[qianzichen@dev03v /usr/lib64]$ objdump -D -S libc.so.6 > /tmp/libc.txt
[qianzichen@dev03v /usr/lib64]$ cd /tmp
[qianzichen@dev03v /tmp]$ grep -A12 'unlinkat' libc.txt 
00000000000e9c00 <unlinkat>:
   e9c00:       48 63 d2                movslq %edx,%rdx
   e9c03:       48 63 ff                movslq %edi,%rdi
   e9c06:       b8 07 01 00 00          mov    $0x107,%eax
   e9c0b:       0f 05                   syscall 
   e9c0d:       48 3d 00 f0 ff ff       cmp    $0xfffffffffffff000,%rax
   e9c13:       77 02                   ja     e9c17 <unlinkat+0x17>
   e9c15:       f3 c3                   repz retq 
   e9c17:       48 8b 15 4a 12 2d 00    mov    0x2d124a(%rip),%rdx        # 3bae68 <_DYNAMIC+0x2e8>
   e9c1e:       f7 d8                   neg    %eax
   e9c20:       64 89 02                mov    %eax,%fs:(%rdx)
   e9c23:       48 83 c8 ff             or     $0xffffffffffffffff,%rax
   e9c27:       c3                      retq   
   e9c28:       0f 1f 84 00 00 00 00    nopl   0x0(%rax,%rax,1)
   e9c2f:       00 

00000000000e9c30 <rmdir>:
   e9c30:       b8 54 00 00 00          mov    $0x54,%eax
   e9c35:       0f 05                   syscall 
[qianzichen@dev03v /tmp]$
複製程式碼

這裡可以看到glibc-2.17最終使用了一些AT&T syntax Assembly language。

先用x86-64中的movslq指令(帶符號擴充套件的傳送),把32位的源暫存器符號擴充套件到64位後寫入目的暫存器,也就是用符號位填充高32位。這裡是把int型別的引數flag(edx)和fd(edi)分別擴充套件為64位的rdx和rdi。

下一步,將0x107這個值載入eax暫存器

隨後,呼叫syscall指令。

開啟Intel的相關晶片手冊,搜尋“syscall”,找到相關描述如下圖。

datasheet 中關於 syscall 的描述
從這段描述中看出,syscall是Intel對64位處理器做的優化,被設計用來為作業系統提供一個平面記憶體模式,我的當前64位機器,syscall/sysret就和32位體系上的sysenter/sysexit的作用相似,可能和舊平臺的int 0x80軟中斷類似,主要是將CPU執行級別從level 3升級為level 0,操作一些應用層無法訪問的資源。

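從"Use CPUID to check if SYSCALL and SYSRET are available (CPUID.80000001H.EDX[bit 11] = 1)"這一句可以看出,軟體可以先以EAX=80000001H執行CPUID指令,再檢查返回在edx暫存器中的bit 11:該位為1表示當前處理器支援64位模式下的syscall/sysret。注意這只是CPUID返回的只讀特性標誌,並不是靠寫edx來使能syscall;真正的使能是由核心設定IA32_EFER暫存器的SCE位完成的。好的我們找出edx暫存器相關。

edx暫存器相關
圖中的bit 11和bit 29都是CPUID返回的特性位(bit 29表示處理器支援Intel 64架構)。而上面反彙編中對edx的操作(movslq %edx,%rdx)與CPUID無關,只是把第三個引數flag符號擴充套件到64位。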

我們確定了,unlinkat是一個system call,rm實用程式將刪除檔案的任務交給作業系統,至此程式陷入核心態。

好的,我們現在到kernel下,直接搜尋unlinkat:

[qianzichen@dev03v /src/linux/linux]$ grep unlinkat ./ -rn
./arch/parisc/include/uapi/asm/unistd.h:297:#define __NR_unlinkat               (__NR_Linux + 281)
./arch/parisc/kernel/syscall_table.S:379:       ENTRY_SAME(unlinkat)
./arch/m32r/include/uapi/asm/unistd.h:309:#define __NR_unlinkat         301
./arch/m32r/kernel/syscall_table.S:303: .long sys_unlinkat
./arch/sparc/include/uapi/asm/unistd.h:358:#define __NR_unlinkat                290
./arch/sparc/kernel/systbls_32.S:78:/*290*/     .long sys_unlinkat, 
./arch/ia64/include/uapi/asm/unistd.h:279:#define __NR_unlinkat                 1287
./arch/ia64/kernel/entry.S:1695:        data8 sys_unlinkat
./arch/ia64/kernel/fsys.S:815:  data8 0                         // unlinkat
./arch/alpha/include/uapi/asm/unistd.h:420:#define __NR_unlinkat                        456
./arch/alpha/kernel/systbls.S:477:      .quad sys_unlinkat
...
./arch/x86/entry/syscalls/syscall_32.tbl:310:301        i386    unlinkat                sys_unlinkat
./arch/x86/entry/syscalls/syscall_64.tbl:272:263        common  unlinkat                sys_unlinkat
...
[qianzichen@dev03v /src/linux/linux]$
複製程式碼

直接看x86體系下的原始碼:

[qianzichen@dev03v /src/linux/linux]$ vi arch/x86/entry/syscalls/syscall_64.tbl
複製程式碼

這是一個列表檔案,

#
# 64-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point>
#
# The abi is "common", "64" or "x32" for this file.
#
0	common	read			sys_read
...
261	common	futimesat		sys_futimesat
262	common	newfstatat		sys_newfstatat
263	common	unlinkat		sys_unlinkat
264	common	renameat		sys_renameat
265	common	linkat			sys_linkat
...

#
# x32-specific system call numbers start at 512 to avoid cache impact
# for native 64-bit operation.
#
512	x32	rt_sigaction		compat_sys_rt_sigaction
...
複製程式碼

這裡看出,unlinkat對應的number是263。還記得寫入eax暫存器中的值嗎,是0x107。很顯然,0x107 = 1 * 16 ^ 2 + 0 * 16 ^ 1 + 7 * 16 ^ 0 = 263。

common代表該系統呼叫為64位與x32兩種ABI通用(見上面檔案頭的註釋)。至此,user space 和 kernel space 的 system call 對映建立。

其實kernel space對編號的對映不是這麼簡單,這裡不再展開。

我們大概知道 user space 的 unlinkat 最終在 kernel space 的 entry point 是 sys_unlinkat 就好了。

還是直接檢視彙編程式碼吧:

[qianzichen@dev03v /src/linux/linux]$ vi arch/x86/entry/entry_64.S
複製程式碼
...
ENTRY(entry_SYSCALL_64)
        /*   
         * Interrupts are off on entry.
         * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
         * it is too small to ever cause noticeable irq latency.
         */
        SWAPGS_UNSAFE_STACK
        movq    %rsp, PER_CPU_VAR(rsp_scratch)
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp

        TRACE_IRQS_OFF

        /* Construct struct pt_regs on stack */
        pushq   $__USER_DS
...
        ja      1f                              /* return -ENOSYS (already in pt_regs->ax) */
        movq    %r10, %rcx

        /*
         * This call instruction is handled specially in stub_ptregs_64.
         * It might end up jumping to the slow path.  If it jumps, RAX
         * and all argument registers are clobbered.
         */
        call    *sys_call_table(, %rax, 8)
...
END(entry_SYSCALL_64)
複製程式碼

rax中存的就是這次syscall的num,即__NR_unlinkat。

ENTRY(entry_SYSCALL_64)是64位的 syscall 彙編入口點,在準備一系列暫存器之後,call *sys_call_table(, %rax, 8)將跳轉到系統呼叫表中的偏移地址,也就是sys_call_table陣列中下標為syscall num對應的函式。

sys_call_table在另一個檔案中定義,這裡用到了一點編譯器擴充套件和預編譯技術的一種高效用法,這裡也不再展開。

/* System call table for x86-64. */
...
#define __SYSCALL_64_QUAL_(sym) sym
#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym

#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
#undef __SYSCALL_64

#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),

extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);

asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 
        /*  
         * Smells like a compiler bug -- it doesn't work
         * when the & below is removed.
         */
        [0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
複製程式碼

什麼時候建立syscall number和sys_unlinkat的對映呢?這要看<asm/syscalls_64.h>,這個標頭檔案是一個過程檔案,在編譯時生成。原對映資訊就是從上文提到的./arch/x86/entry/syscalls/syscall_64.tbl中獲得。

編譯出來的syscalls_64.h結果為:

__SYSCALL_COMMON(49, sys_bind, sys_bind)
__SYSCALL_COMMON(50, sys_listen, sys_listen)
...
__SYSCALL_COMMON(263, sys_unlinkat, sys_unlinkat)
複製程式碼

__SYSCALL_COMMON就是__SYSCALL_64,如上文述sys_call_table的定義,第一個__SYSCALL_64的定義是為了將syscalls_64.h展開為函式宣告,之後將__SYSCALL_64重新定義後,是為了將syscalls_64.h展開為陣列成員的定義。

所以最終核心得到的,是一個只讀的sys_call_table陣列,下標為syscall number,指向的是核心的sys_call_ptr_t。syscall num從0開始,所以直接根據263就可以找到sys_unlinkat。

現在核心已經確定了要呼叫的是sys_unlinkat,那麼這個函式在哪裡定義的呢?經過我的一番嘗試,4.9中直接找sys_unlinkat是找不到實現的,因為這個字串可能經過預編譯粘合。

我最終找到的巨集是這樣定義的:

...
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

#define SYSCALL_DEFINEx(x, sname, ...)                          \
        SYSCALL_METADATA(sname, x, __VA_ARGS__)                 \
        __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...)                                 \
        asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))       \
                __attribute__((alias(__stringify(SyS##name))));         \
        static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));  \
        asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));      \
        asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))       \
        {                                                               \
                long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
                __MAP(x,__SC_TEST,__VA_ARGS__);                         \
                __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));       \
                return ret;                                             \
        }                                                               \
        static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
...
複製程式碼

然後找到,sys_unlinkat的程式碼在fs/namei.c中:

4078 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
4079 {
4080         if ((flag & ~AT_REMOVEDIR) != 0)
4081                 return -EINVAL;
4082 
4083         if (flag & AT_REMOVEDIR)
4084                 return do_rmdir(dfd, pathname);
4085 
4086         return do_unlinkat(dfd, pathname);
4087 }
複製程式碼

然後呼叫do_unlinkat:

3999 /*
4000  * Make sure that the actual truncation of the file will occur outside its
4001  * directory's i_mutex.  Truncate can take a long time if there is a lot of
4002  * writeout happening, and we don't want to prevent access to the directory
4003  * while waiting on the I/O.
4004  */
4005 static long do_unlinkat(int dfd, const char __user *pathname)
4006 {
4007         int error;
4008         struct filename *name;
4009         struct dentry *dentry;
4010         struct path path;
4011         struct qstr last;
4012         int type;
4013         struct inode *inode = NULL;
4014         struct inode *delegated_inode = NULL;
4015         unsigned int lookup_flags = 0;
4016 retry:
4017         name = filename_parentat(dfd, getname(pathname), lookup_flags,
4018                                 &path, &last, &type);
4019         if (IS_ERR(name))
4020                 return PTR_ERR(name);
4021 
4022         error = -EISDIR;
4023         if (type != LAST_NORM)
4024                 goto exit1;
4025 
4026         error = mnt_want_write(path.mnt);
4027         if (error)
4028                 goto exit1;
4029 retry_deleg:
4030         inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
4031         dentry = __lookup_hash(&last, path.dentry, lookup_flags);
4032         error = PTR_ERR(dentry);
4033         if (!IS_ERR(dentry)) {
4034                 /* Why not before? Because we want correct error value */
4035                 if (last.name[last.len])
4036                         goto slashes;
4037                 inode = dentry->d_inode;
4038                 if (d_is_negative(dentry))
4039                         goto slashes;
4040                 ihold(inode);
4041                 error = security_path_unlink(&path, dentry);
4042                 if (error)
4043                         goto exit2;
4044                 error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
4045 exit2:
4046                 dput(dentry);
4047         }
4048         inode_unlock(path.dentry->d_inode);
4049         if (inode)
4050                 iput(inode);    /* truncate the inode here */
4051         inode = NULL;
4052         if (delegated_inode) {
4053                 error = break_deleg_wait(&delegated_inode);
4054                 if (!error)
4055                         goto retry_deleg;
4056         }
4057         mnt_drop_write(path.mnt);
4058 exit1:
4059         path_put(&path);
4060         putname(name);
4061         if (retry_estale(error, lookup_flags)) {
4062                 lookup_flags |= LOOKUP_REVAL;
4063                 inode = NULL;
4064                 goto retry;
4065         }
4066         return error;
4067 
4068 slashes:
4069         if (d_is_negative(dentry))
4070                 error = -ENOENT;
4071         else if (d_is_dir(dentry))
4072                 error = -EISDIR;
4073         else
4074                 error = -ENOTDIR;
4075         goto exit2;
4076 }
複製程式碼

好了,讀者隨著我到這一步,已經看到了軟體工程中比較具有美感的一個地方:4044行,呼叫了vfs_unlink。從user space到system call再至此,sys_unlinkat將unlinkat的任務,dispatch給作業系統的虛擬檔案系統。

我們看一下vfs_unlink的實現:

3941 /**
3942  * vfs_unlink - unlink a filesystem object
3943  * @dir:        parent directory
3944  * @dentry:     victim
3945  * @delegated_inode: returns victim inode, if the inode is delegated.
3946  *
3947  * The caller must hold dir->i_mutex.
3948  *
3949  * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
3950  * return a reference to the inode in delegated_inode.  The caller
3951  * should then break the delegation on that inode and retry.  Because
3952  * breaking a delegation may take a long time, the caller should drop
3953  * dir->i_mutex before doing so.
3954  *
3955  * Alternatively, a caller may pass NULL for delegated_inode.  This may
3956  * be appropriate for callers that expect the underlying filesystem not
3957  * to be NFS exported.
3958  */
3959 int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
3960 {
3961         struct inode *target = dentry->d_inode;
3962         int error = may_delete(dir, dentry, 0);
3963 
3964         if (error)
3965                 return error;
3966 
3967         if (!dir->i_op->unlink)
3968                 return -EPERM;
3969 
3970         inode_lock(target);
3971         if (is_local_mountpoint(dentry))
3972                 error = -EBUSY;
3973         else {
3974                 error = security_inode_unlink(dir, dentry);
3975                 if (!error) {
3976                         error = try_break_deleg(target, delegated_inode);
3977                         if (error)
3978                                 goto out;
3979                         error = dir->i_op->unlink(dir, dentry);
3980                         if (!error) {
3981                                 dont_mount(dentry);
3982                                 detach_mounts(dentry);
3983                         }
3984                 }
3985         }
3986 out:
3987         inode_unlock(target);
3988 
3989         /* We don't d_delete() NFS sillyrenamed files--they still exist. */
3990         if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
3991                 fsnotify_link_count(target);
3992                 d_delete(dentry);
3993         }
3994 
3995         return error;
3996 }
3997 EXPORT_SYMBOL(vfs_unlink);
複製程式碼

我們看到,3979行,呼叫inode例項中i_op成員的unlink函式指標,這個指標才指向了具體檔案系統提供的真正實現。

現在看inode結構的定義:

/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
 */
struct inode {
        umode_t                 i_mode;
...
        const struct inode_operations   *i_op;
        struct super_block      *i_sb;

        /* Stat data, not accessed from path walking */
        unsigned long           i_ino;
...

#ifdef CONFIG_FSNOTIFY
        __u32                   i_fsnotify_mask; /* all events this inode cares about */
        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
#endif

#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
        struct fscrypt_info     *i_crypt_info;
#endif

        void                    *i_private; /* fs or device private pointer */
};
複製程式碼

可以看到上文的inode例項中的i_op成員是一個inode_operations結構指標。

現在看inode_operations的定義:

struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
...
        int (*create) (struct inode *,struct dentry *, umode_t, bool);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
        int (*unlink) (struct inode *,struct dentry *);
        int (*symlink) (struct inode *,struct dentry *,const char *);
...
} ____cacheline_aligned;
複製程式碼

vfs下層的各種檔案系統,需要按照inode_operations中的規範,完成unlink的實現,向kernel vfs註冊。

這裡不展開bootloader自舉之後的硬體初始化,也忽略kernel接管機器資源之後的一些register機制,直接看當前機器是怎麼向vfs最終註冊。

看了一下,我機器上掛載的是ext4檔案系統,直接看ext4的unlink的最終註冊過程:

...
3845 /*
3846  * directories can handle most operations...
3847  */
3848 const struct inode_operations ext4_dir_inode_operations = {
...
3851         .link           = ext4_link,
3852         .unlink         = ext4_unlink,
3853         .symlink        = ext4_symlink,
...
3865 };
複製程式碼

ext4_dir_inode_operations例項中,完成了函式指標的賦值。

直接看ext4_unlink的實現:

static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
        int retval;
        struct inode *inode;
        struct buffer_head *bh; 
        struct ext4_dir_entry_2 *de; 
        handle_t *handle = NULL;

        if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
                return -EIO;

        trace_ext4_unlink_enter(dir, dentry);
        /* Initialize quotas before so that eventual writes go
         * in separate transaction */
        retval = dquot_initialize(dir);
        if (retval)
                return retval;
        retval = dquot_initialize(d_inode(dentry));
        if (retval)
                return retval;

        retval = -ENOENT;
        bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        if (!bh)
                goto end_unlink;

        inode = d_inode(dentry);

        retval = -EFSCORRUPTED;
        if (le32_to_cpu(de->inode) != inode->i_ino)
                goto end_unlink;

        handle = ext4_journal_start(dir, EXT4_HT_DIR,
                                    EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
                handle = NULL;
                goto end_unlink;
        }    

        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);

        if (inode->i_nlink == 0) { 
                ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
                                   dentry->d_name.len, dentry->d_name.name);
                set_nlink(inode, 1);
        }
        retval = ext4_delete_entry(handle, dir, de, bh);
        if (retval)
                goto end_unlink;
        dir->i_ctime = dir->i_mtime = current_time(dir);
        ext4_update_dx_flag(dir);
        ext4_mark_inode_dirty(handle, dir);
        drop_nlink(inode);
        if (!inode->i_nlink)
                ext4_orphan_add(handle, inode);
        inode->i_ctime = current_time(inode);
        ext4_mark_inode_dirty(handle, inode);

end_unlink:
        brelse(bh);
        if (handle)
                ext4_journal_stop(handle);
        trace_ext4_unlink_exit(dentry, retval);
        return retval;
}
複製程式碼

看d_inode的實現:

static inline struct inode *d_inode(const struct dentry *dentry)
{
	return dentry->d_inode;
}
複製程式碼

d_inode(dentry)將inode資訊從dentry結構中取出來,dentry結構定義如下:

struct dentry {
	/* RCU lookup touched fields */
...
	struct qstr d_name;
	struct inode *d_inode;		/* Where the name belongs to - NULL is
...
	union {
		struct hlist_node d_alias;	/* inode alias list */
		struct hlist_bl_node d_in_lookup_hash;	/* only for in-lookup ones */
	 	struct rcu_head d_rcu;
	} d_u;
};
複製程式碼

dentry這一層,不是簡單的從硬碟中移除。為了高效能,當前ext4對目錄做了一些快取處理。應該是先設定標誌位,然後根據sync機制回寫儲存。

vfs之下的機制就先不詳述了,因為我也不太清楚,蛤蛤。

相關文章