Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)

Andrew.Hann發表於2014-08-24

目錄

0. 引言
1. open() syscall
2. close() syscall

 

0. 引言

在linux的哲學中,所有的磁碟檔案、目錄、外設裝置、驅動裝置全部被抽象為了"檔案"這個概念,所以本文提到的"File IO"適用於linux下所有的IO操作。需要明白的是,本文分析的是linux下的IO系統呼叫對應的核心原始碼,linux下每一個系統呼叫都有對應的核心原始碼,而我們在ring3程式設計時常用的glibc等C庫API,只是對系統呼叫的一層封裝,最終還是要通過系統呼叫實現功能

0x1: SYSCALL_DEFINE巨集定義

我們在學習核心原始碼的時候經常會遇到一個巨集定義: SYSCALL_DEFINE,所有的系統呼叫的宣告都通過它來實現

\linux-2.6.32.63\include\linux\syscalls.h

/*
 * SYSCALL_DEFINE0(name) - open the definition of a system call that takes
 * no arguments.  The function body follows the macro invocation.
 *
 * With CONFIG_FTRACE_SYSCALLS the macro additionally emits the syscall
 * enter/exit trace events and a static struct syscall_metadata record
 * (name "sys_<name>", nb_args 0) placed in the "__syscalls_metadata"
 * section, then the sys_<name>(void) header.
 *
 * Without CONFIG_FTRACE_SYSCALLS it expands to nothing more than
 *     asmlinkage long sys_<name>(void)
 *
 * The opening #ifdef is required: the #else/#endif pair below belongs to it.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
#define SYSCALL_DEFINE0(sname)                    \
    SYSCALL_TRACE_ENTER_EVENT(_##sname);            \
    SYSCALL_TRACE_EXIT_EVENT(_##sname);            \
    static const struct syscall_metadata __used        \
      __attribute__((__aligned__(4)))            \
      __attribute__((section("__syscalls_metadata")))    \
      __syscall_meta_##sname = {                \
        .name         = "sys_"#sname,            \
        .nb_args     = 0,                \
        .enter_event    = &event_enter__##sname,    \
        .exit_event    = &event_exit__##sname,        \
    };                            \
    asmlinkage long sys_##sname(void)
#else
    #define SYSCALL_DEFINE0(name)       asmlinkage long sys_##name(void)
#endif

/*
 * SYSCALL_DEFINEn(name, type1, arg1, ..., typen, argn) - front ends for
 * system calls taking n = 1..6 arguments.
 *
 * Each one forwards to SYSCALL_DEFINEx() with the argument count and with
 * the syscall name prefixed by an underscore (_##name), so that later
 * pasting such as sys##name yields "sys_<name>".  The remaining arguments
 * are passed through unchanged as (type, identifier) pairs.
 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

...

/*
 * SYSCALL_DEFINEx(x, sname, ...) - common back end of SYSCALL_DEFINE1..6.
 *
 * @x:     number of syscall arguments
 * @sname: syscall name, already carrying its leading underscore
 * @...:   the (type, identifier) pairs describing the arguments
 *
 * With CONFIG_FTRACE_SYSCALLS it first emits two static string tables,
 * types_<sname>[] and args_<sname>[], filled by __SC_STR_TDECLx and
 * __SC_STR_ADECLx (presumably the stringified argument types and names -
 * those helpers are not shown here), registers them through
 * SYSCALL_METADATA(sname, x), and then expands __SYSCALL_DEFINEx().
 * Without it, only __SYSCALL_DEFINEx() is expanded.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
    #define SYSCALL_DEFINEx(x, sname, ...)                \
        static const char *types_##sname[] = {            \
            __SC_STR_TDECL##x(__VA_ARGS__)            \
        };                            \
        static const char *args_##sname[] = {            \
            __SC_STR_ADECL##x(__VA_ARGS__)            \
        };                            \
        SYSCALL_METADATA(sname, x);                \
        __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#else /* !CONFIG_FTRACE_SYSCALLS: no tracing metadata, header only */
    #define SYSCALL_DEFINEx(x, sname, ...)                \
        __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#endif

/*
 * __SYSCALL_DEFINEx(x, name, ...) - emit the function header(s) of a system
 * call with x arguments; the syscall body follows the macro invocation.
 * @name arrives with its leading underscore, so sys##name is "sys_<name>".
 *
 * With CONFIG_HAVE_SYSCALL_WRAPPERS the expansion is, in order:
 *   1. a prototype for sys<name>() with the declared parameter types;
 *   2. a forward declaration of the static inline worker SYSC<name>();
 *   3. the definition of the wrapper SyS<name>(), whose parameters are built
 *      by __SC_LONGx (presumably each argument widened to long - helper not
 *      shown here); it runs __SC_TESTx on the arguments and returns the
 *      result of SYSC<name>() called with the __SC_CASTx-converted
 *      arguments, cast to long;
 *   4. SYSCALL_ALIAS(sys<name>, SyS<name>), tying the public name to the
 *      wrapper;
 *   5. the header of SYSC<name>(), which receives the body written after
 *      the macro.
 *
 * Without wrappers the expansion is simply
 *     asmlinkage long sys<name>(<declared parameters>)
 *
 * SYSCALL_DEFINE(name) is the untyped variant: it yields only the function
 * name part (SYSC_<name> or sys_<name>) and leaves the parameter list to
 * the caller.
 */
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
    #define SYSCALL_DEFINE(name) static inline long SYSC_##name
    #define __SYSCALL_DEFINEx(x, name, ...)                    \
    asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));        \
    static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));    \
    asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))        \
    {                                \
        __SC_TEST##x(__VA_ARGS__);                \
        return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));    \
    }                                \
    SYSCALL_ALIAS(sys##name, SyS##name);                \
    static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
    #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
    #define __SYSCALL_DEFINEx(x, name, ...) asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */

所以在未開啟CONFIG_FTRACE_SYSCALLS和CONFIG_HAVE_SYSCALL_WRAPPERS的情況下,對函式定義

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)就等於
asmlinkage long sys_socket(int family, int type, int protocol)

Relevant Link:

http://blog.csdn.net/p_panyuch/article/details/5648007

 

1. open() syscall

open()系統呼叫在kernel中對應的是sys_open()

\linux-2.6.32.63\fs\open.c

/*
 * open(2) entry point.  Optionally forces O_LARGEFILE, then hands the
 * request to do_sys_open() with AT_FDCWD as the directory descriptor.
 * Returns whatever do_sys_open() returns (a descriptor or a negative errno).
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
    long fd;

    /* Turn on large-file support when force_o_largefile() asks for it. */
    if (force_o_largefile()) {
        flags |= O_LARGEFILE;
    }

    /* do_sys_open() carries out the actual work. */
    fd = do_sys_open(AT_FDCWD, filename, flags, mode);

    /* avoid REGPARM breakage on x86: */
    asmlinkage_protect(3, fd, filename, flags, mode);

    return fd;
}

繼續跟進do_sys_open()函式

/*
 * do_sys_open - common implementation behind the open(2) system call.
 *
 * @dfd:      directory descriptor handed through to do_filp_open()
 * @filename: user-space pointer to the path name
 * @flags:    open flags
 * @mode:     creation mode
 *
 * Returns the new file descriptor, or a negative errno taken from
 * getname(), get_unused_fd_flags() or do_filp_open().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
    struct file *file;
    char *kname;
    int fd;

    /*
     * getname() obtains kernel storage for the path name and copies the
     * name over from user space.
     */
    kname = getname(filename);
    if (IS_ERR(kname)) {
        fd = PTR_ERR(kname);
        return fd;
    }

    /*
     * Reserve a free descriptor.  Per the original annotation this goes
     * through alloc_fd(), which picks an unused slot in the fd table and
     * initialises it.
     */
    fd = get_unused_fd_flags(flags);
    if (fd < 0) {
        goto out_putname;
    }

    /* With a descriptor in hand, do the real open. */
    file = do_filp_open(dfd, kname, flags, mode, 0);
    if (IS_ERR(file)) {
        /* Open failed: give the descriptor back and report the error. */
        put_unused_fd(fd);
        fd = PTR_ERR(file);
        goto out_putname;
    }

    /* Open succeeded: send the fsnotify "open" notification ... */
    fsnotify_open(file->f_path.dentry);
    /*
     * ... and install the file pointer in the descriptor table, where
     * every process keeps its open files.
     */
    fd_install(fd, file);

out_putname:
    /* Release the kernel copy of the path name. */
    putname(kname);
    return fd;
}

繼續跟進do_filp_open()函式

/*
 * do_filp_open - resolve @pathname and return the opened struct file.
 *
 * @dfd:       directory descriptor passed on to path_lookup_open()/path_init()
 * @pathname:  path name to open
 * @open_flag: open flags as supplied by the caller
 * @mode:      creation mode; stored in the open intent and passed to
 *             __open_namei_create() when the file has to be created
 * @acc_mode:  requested access mask; when 0 it is derived from the flags
 *
 * Returns the struct file on success, or an ERR_PTR()-encoded negative
 * errno on failure.
 *
 * Note that the low bits of the passed in "open_flag"
 * are not the same as in the local variable "flag". See
 * open_to_namei_flags() for more details.
 */
struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode, int acc_mode)
{
    /* Local state shared by the lookup, create and symlink paths below. */
    struct file *filp;
    struct nameidata nd;
    int error;
    struct path path;
    struct dentry *dir;
    int count = 0;
    int will_write;
    /*
     * Convert the caller's flags into the encoding used by the lookup code
     * (only the low bits differ; see open_to_namei_flags()).
     */
    int flag = open_to_namei_flags(open_flag);
    /* No access mode supplied by the caller: derive it from the flags. */
    if (!acc_mode)
    {
        acc_mode = MAY_OPEN | ACC_MODE(flag);
    } 

    /* O_TRUNC implies we need access checks for write permissions */
    if (flag & O_TRUNC)
    {
        acc_mode |= MAY_WRITE;
    } 

    /* Allow the LSM permission hook to distinguish append access from general write access. */
    if (flag & O_APPEND)
    {
        acc_mode |= MAY_APPEND;
    } 

    /* The simplest case - just a plain lookup. */
    /* Taken when the file is not being created (no O_CREAT). */
    if (!(flag & O_CREAT)) 
    { 
        /*
         * Before the kernel can access a file it first has to find it.  In
         * the VFS this lookup is done by path_lookup() or
         * path_lookup_open().  Per the original annotation, they turn the
         * path string supplied by the user into a dentry and set up the
         * matching inode and file structures; the descriptor that refers
         * to the file is later returned to user space, which uses it to
         * reach these structures.
         */
        error = path_lookup_open(dfd, pathname, lookup_flags(flag), &nd, flag);
        if (error)
        {
            return ERR_PTR(error);
        } 
        goto ok;
    }

    /*
     * Create - we need to know the parent.
     */
    /*
     * path_init() prepares the lookup and path_walk() performs the actual
     * traversal; together they resolve a path name to its dentry.
     */
    error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
    if (error)
    {
        return ERR_PTR(error);
    } 
    /*
     * path_walk() is the central name-resolution routine of the VFS.
     * NOTE(review): the original annotation said "NFS", which looks like a
     * typo for VFS.  Per that annotation it walks the path one directory
     * level at a time in a loop until it reaches the end or fails, building
     * the dentries and inodes of the directory tree along the way.
     */
    error = path_walk(pathname, &nd);
    if (error) 
    {
        if (nd.root.mnt)
        {
            /* Drop the reference (dentry and vfsmount counts) held on nd.root. */
            path_put(&nd.root);
        } 
        return ERR_PTR(error);
    }
    if (unlikely(!audit_dummy_context()))
    {
        /* Record the inode information for auditing. */
        audit_inode(pathname, nd.path.dentry);
    } 

    /*
     * We have the parent and last component. First of all, check
     * that we are not asked to creat(2) an obvious directory - that
     * will not do.
     */
    error = -EISDIR;
    /*
     * The last component must be a normal name with nothing following it;
     * otherwise fail with -EISDIR.
     */
    if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
    {
        goto exit_parent;
    } 

    error = -ENFILE;
    /* Obtain an empty struct file; NULL means none is available (-ENFILE). */
    filp = get_empty_filp();
    if (filp == NULL)
    {
        goto exit_parent;
    } 
    /* Fill in the open intent carried inside the nameidata. */
    nd.intent.open.file = filp;
    nd.intent.open.flags = flag;
    nd.intent.open.create_mode = mode;
    dir = nd.path.dentry;
    nd.flags &= ~LOOKUP_PARENT;
    nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
    if (flag & O_EXCL)
    {
        nd.flags |= LOOKUP_EXCL;
    } 
    mutex_lock(&dir->d_inode->i_mutex);
    /* Look up the dentry of the last component while holding the parent's i_mutex. */
    path.dentry = lookup_hash(&nd);
    path.mnt = nd.path.mnt;

do_last:
    /* Reached with dir's i_mutex held and path.dentry holding the lookup result. */
    error = PTR_ERR(path.dentry);
    if (IS_ERR(path.dentry)) 
    {
        mutex_unlock(&dir->d_inode->i_mutex);
        goto exit;
    }

    if (IS_ERR(nd.intent.open.file)) 
    {
        error = PTR_ERR(nd.intent.open.file);
        goto exit_mutex_unlock;
    }

    /* Negative dentry, just create the file */
    /* A dentry without an inode has nothing behind it, so the file node must be created. */
    if (!path.dentry->d_inode) 
    {
        /*
         * This write is needed to ensure that a
         * ro->rw transition does not occur between
         * the time when the file is created and when
         * a permanent write count is taken through
         * the 'struct file' in nameidata_to_filp().
        */
        /* Write access to the mount is mandatory here. */
        error = mnt_want_write(nd.path.mnt);
        if (error)
        {
            goto exit_mutex_unlock;
        } 
        /* Create the file, passing the namei-format flag. */
        error = __open_namei_create(&nd, &path, flag, mode);
        if (error) 
        {
            mnt_drop_write(nd.path.mnt);
            goto exit;
        }
        /* Turn the nameidata into the resulting struct file. */
        filp = nameidata_to_filp(&nd, open_flag);
        if (IS_ERR(filp))
        {
            ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
        } 
        /* Give up the write access taken above. */
        mnt_drop_write(nd.path.mnt);
        if (nd.root.mnt)
        {
            /* Drop the reference on nd.root. */
            path_put(&nd.root);
        } 
        return filp;
    }

    /*
     * It already exists.
     */
    mutex_unlock(&dir->d_inode->i_mutex);
    /* Record the inode for auditing. */
    audit_inode(pathname, path.dentry);

    error = -EEXIST;
    /* Flag checks: O_EXCL on an existing file fails with -EEXIST. */
    if (flag & O_EXCL)
    {
        goto exit_dput;
    } 

    if (__follow_mount(&path))
    {
        error = -ELOOP;
        if (flag & O_NOFOLLOW)
        {
            goto exit_dput;
        } 
    }

    error = -ENOENT;
    if (!path.dentry->d_inode)
    {
        goto exit_dput;
    } 
    /* An inode that provides ->follow_link is handled on the symlink path. */
    if (path.dentry->d_inode->i_op->follow_link)
    {
        goto do_link;
    } 
    /* Transfer the path into the nameidata. */
    path_to_nameidata(&path, &nd);
    error = -EISDIR;
    /* A directory cannot be opened on this (O_CREAT) path. */
    if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
    {
        goto exit;
    } 
ok:
    /*
     * Consider:
     * 1. may_open() truncates a file
     * 2. a rw->ro mount transition occurs
     * 3. nameidata_to_filp() fails due to
     *    the ro mount.
     * That would be inconsistent, and should
     * be avoided. Taking this mnt write here
     * ensures that (2) can not occur.
     */
    /* Work out whether this open will write to the filesystem (e.g. truncation). */
    will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
    if (will_write) 
    {
        /* If so, write access to the mount has to be taken first. */
        error = mnt_want_write(nd.path.mnt);
        if (error)
        {
            goto exit;
        } 
    }
    /* may_open() performs the permission checks and the open/truncate handling. */
    error = may_open(&nd.path, acc_mode, flag);
    if (error) 
    {
        if (will_write)
        {
            mnt_drop_write(nd.path.mnt);
        } 
        goto exit;
    }
    filp = nameidata_to_filp(&nd, open_flag);
    if (IS_ERR(filp))
    {
        ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
    }
        
    /*
     * It is now safe to drop the mnt write
     * because the filp has had a write taken
     * on its behalf.
     */
    if (will_write)
    {
        mnt_drop_write(nd.path.mnt);
    } 
    if (nd.root.mnt)
    {
        path_put(&nd.root);
    } 
    return filp;

/*
 * Error exits.  Each label falls through to the ones below it, so the
 * cleanup performed depends on how far the function had got.
 */
exit_mutex_unlock:
    mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
    path_put_conditional(&path, &nd);
exit:
    if (!IS_ERR(nd.intent.open.file))
    {
        release_open_intent(&nd);
    }
        
exit_parent:
    if (nd.root.mnt)
    {
        path_put(&nd.root);
    } 
    path_put(&nd.path);
    return ERR_PTR(error);

do_link:
    /* Symlink: if following links is allowed, locate the link target by hand. */
    error = -ELOOP;
    if (flag & O_NOFOLLOW)
    {
        /* Following links is not allowed: fail with -ELOOP. */
        goto exit_dput;
    } 
    /*
     * This is subtle. Instead of calling do_follow_link() we do the
     * thing by hands. The reason is that this way we have zero link_count
     * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
     * After that we have the parent and last component, i.e.
     * we are in the same situation as after the first path_walk().
     * Well, almost - if the last component is normal we get its copy
     * stored in nd->last.name and we will have to putname() it when we
     * are done. Procfs-like symlinks just set LAST_BIND.
     */
    /* The code below locates by hand the dentry of the file the link points to. */

    /* Set the LOOKUP_PARENT lookup flag. */
    nd.flags |= LOOKUP_PARENT;
    /* Ask the security hook whether following this link is permitted. */
    error = security_inode_follow_link(path.dentry, &nd);
    if (error)
    {
        goto exit_dput;
    } 
    /* Follow the symbolic link. */
    error = __do_follow_link(&path, &nd);
    if (error) 
    {
        /* Does someone understand code flow here? Or it is only
         * me so stupid? Anathema to whoever designed this non-sense
         * with "intent.open".
         */
        release_open_intent(&nd);
        if (nd.root.mnt)
        {
            path_put(&nd.root);
        } 
        return ERR_PTR(error);
    }
    nd.flags &= ~LOOKUP_PARENT;
    /* Examine the type of the last file or directory name component. */
    if (nd.last_type == LAST_BIND)
    {
        goto ok;
    } 
    error = -EISDIR;
    if (nd.last_type != LAST_NORM)
    {
        goto exit;
    } 
    if (nd.last.name[nd.last.len]) 
    {
        __putname(nd.last.name);
        goto exit;
    }
    error = -ELOOP;
    /* Link loop guard: give up once 32 links have already been followed. */
    if (count++==32) 
    {
        __putname(nd.last.name);
        goto exit;
    }
    dir = nd.path.dentry;
    mutex_lock(&dir->d_inode->i_mutex);
    /* Refresh path with the dentry and mount of the new last component. */
    path.dentry = lookup_hash(&nd);
    path.mnt = nd.path.mnt;
    __putname(nd.last.name);
    goto do_last;
}

總結一下流程

1. open系統呼叫的入口是由SYSCALL_DEFINE3巨集定義的sys_open()函式
2. 在open系統呼叫中,呼叫do_sys_open函式完成主要功能
3. 在do_sys_open函式中,呼叫函式do_filp_open完成主要的開啟功能
4. 在核心中要開啟一個檔案,首先應該找到這個檔案,而查詢檔案的過程在vfs裡面是由do_path_lookup或者path_lookup_open函式來完成的
    4.1 設定nd->root=根路徑(絕對地址)或者當前工作目錄(相對地址)
    4.2 這一步做完了後,核心會建立一些資料結構(dentry,inode)來初始化查詢的起點
    if(!retval){ retval = path_walk(name,nd);}
    4.3 path_walk會遍歷路徑的每一節點分量,也就是用"/"分隔開的每一部分,最終找到name指向的檔案 
    /* path_walk() simply delegates the whole job to link_path_walk(). */
    int path_walk(const char *name,struct nameidata *nd)
    {
        int err = link_path_walk(name, nd);

        return err;
    }
    4.4 link_path_walk的主要工作是由其內部函式__link_path_walk來完成的
        result = __link_path_walk(name,nd)
    4.5 __link_path_walk,該函式把傳進來的字串name,也就是使用者指定的路徑,按路徑分隔符分解成一系列小的component。比如使用者說,我要找"/path/to/dest"這個檔案,那麼我們的檔案系統就會按path、to、dest一個
一個來找,直到最後一個分量是檔案或者查詢完成。它找的時候,會先用path_init初始化過的根路徑去找第一個分量,也就是path。然後用path的dentry->d_inode去找to,這樣迴圈到最後一個。注意,核心會快取找到的路徑分量,
所以往往只有第一次訪問一個路徑的時候,才會去訪問磁碟,後面的訪問會直接從快取裡找,下面會看到很多與頁高速快取打交道的程式碼。但不管怎樣,第一遍查詢總是會訪問磁碟的
static int __link_path_walk(const char *name,struct nameidata *nd){..} 至此,按照每一個component查詢完成之後,就會找到相應的檔案,然後相應的開啟工作就基本完成了

Relevant Link:

http://oss.org.cn/kernel-book/
http://blog.csdn.net/f413933206/article/details/5701913

 

2. close() syscall

close()系統呼叫對應核心中的函式為: sys_close()

\linux-2.6.32.63\fs\open.c

/*
 * Careful here! The file pointer is tested for NULL before the
 * descriptor is released, which guarantees that one clone task cannot
 * release an fd while another clone is still opening it.
 *
 * Returns the result of filp_close(), -EINTR in place of any restart
 * code, or -EBADF when @fd does not refer to an open file.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
    struct files_struct *files = current->files;
    struct fdtable *fdt;
    struct file *filp;
    int retval;

    spin_lock(&files->file_lock);

    /*
     * Fetch the pointer to the struct fdtable.  From
     * include/linux/fdtable.h:
     *   #define files_fdtable(files) (rcu_dereference((files)->fdt))
     */
    fdt = files_fdtable(files);

    /*
     * Pick up the struct file stored in slot @fd; an out-of-range
     * descriptor or an empty slot is rejected with -EBADF.
     */
    filp = (fd < fdt->max_fds) ? fdt->fd[fd] : NULL;
    if (filp == NULL) {
        spin_unlock(&files->file_lock);
        return -EBADF;
    }

    /* Clear the slot in the descriptor array ... */
    rcu_assign_pointer(fdt->fd[fd], NULL);
    /* ... and its close-on-exec bit. */
    FD_CLR(fd, fdt->close_on_exec);

    /*
     * Hand the descriptor number back so that a later open can reuse it:
     *
     *   static void __put_unused_fd(struct files_struct *files, unsigned int fd)
     *   {
     *       struct fdtable *fdt = files_fdtable(files);
     *       __FD_CLR(fd, fdt->open_fds);
     *       if (fd < files->next_fd)
     *       {
     *           files->next_fd = fd;
     *       }
     *   }
     */
    __put_unused_fd(files, fd);
    spin_unlock(&files->file_lock);

    retval = filp_close(filp, files);

    /* can't restart close syscall because file table entry was cleared */
    switch (retval) {
    case -ERESTARTSYS:
    case -ERESTARTNOINTR:
    case -ERESTARTNOHAND:
    case -ERESTART_RESTARTBLOCK:
        retval = -EINTR;
        break;
    default:
        break;
    }

    return retval;
}
EXPORT_SYMBOL(sys_close);

對於sys_close(),我們需要重點跟進2個函式: rcu_assign_pointer(fdt->fd[fd], NULL); 和 retval = filp_close(filp, files);

\linux-2.6.32.63\include\linux\rcupdate.h

/**
 * rcu_assign_pointer - assign (publicize) a pointer to a newly
 * initialized structure that will be dereferenced by RCU read-side
 * critical sections.  Returns the value assigned.
 *
 * @p: the pointer (lvalue) being assigned to
 * @v: the value to store in @p
 *
 * Inserts memory barriers on architectures that require them
 * (pretty much all of them other than x86), and also prevents
 * the compiler from reordering the code that initializes the
 * structure after the pointer assignment.  More importantly, this
 * call documents which pointers will be dereferenced by RCU read-side
 * code.
 *
 * The write barrier (smp_wmb()) is issued unless @v is a compile-time
 * constant equal to NULL, as in rcu_assign_pointer(p, NULL) - presumably
 * because in that case there is no newly initialized structure whose
 * stores would have to be ordered before the pointer store.
 *
 * Implemented as a statement expression, so the value of the final
 * assignment is the value of the whole macro.
 */

#define rcu_assign_pointer(p, v) \
    ({ \
        if (!__builtin_constant_p(v) || \
            ((v) != NULL)) \
            smp_wmb(); \
        (p) = (v); \
    })

我們知道,每個程式在kernel中都有一個task_struct與之對應,而通過task_struct可以間接地獲得一個fd_array[]陣列,表示當前程式已經開啟的檔案。陣列以檔案描述符的值作為下標,每一個元素都是一個struct file*指標,只有通過這個fd_array[x]才能獲取當前程式開啟的檔案的struct file*。而rcu_assign_pointer(fdt->fd[fd], NULL)的作用就在於將這個陣列的指定元素置空,即斷開了這個引用的關係,至於之後核心記憶體中的那個struct file是否釋放,那是記憶體回收的事,至少現在程式想通過task_struct是無法再引用到之前開啟過的檔案了,這裡面的關係圖可以參閱:

http://www.cnblogs.com/LittleHann/p/3865490.html
//搜尋: 用一張圖表示task_struct、fs_struct、files_struct、fdtable、file的關係

我們繼續分析retval = filp_close(filp, files);

\linux-2.6.32.63\fs\open.c

/*
 * filp_close - close one reference to an open file.
 *
 * "id" is the POSIX thread ID. We use the
 * files pointer for this..
 *
 * Calls the file's ->flush() method when it has one, then removes the
 * dnotify state and POSIX locks associated with @id and drops the
 * reference with fput().  Returns the result of ->flush(), or 0 when
 * there is no flush method or the file count is already zero.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
    int flush_status = 0;

    /* A zero file count here is reported and then ignored. */
    if (file_count(filp) == 0) {
        printk(KERN_ERR "VFS: Close: file count is 0\n");
        return 0;
    }

    /* ->flush() is optional; its result becomes the return value. */
    if (filp->f_op != NULL && filp->f_op->flush != NULL) {
        flush_status = filp->f_op->flush(filp, id);
    }

    dnotify_flush(filp, id);
    locks_remove_posix(filp, id);
    fput(filp);

    return flush_status;
}

filp_close()在呼叫flush並清理dnotify和POSIX鎖之後,通過fput()釋放當前程式對struct file的引用;只有當引用計數降為0時,struct file佔用的記憶體空間才會被真正釋放。至此,當前程式在核心中就再也沒有對之前開啟過的這個檔案的引用了

Relevant Link:

http://blog.csdn.net/ce123_zhouwei/article/details/8459794

 

Copyright (c) 2014 LittleHann All rights reserved

 

相關文章