目錄
0. 引言 1. open() syscall 2. close() syscall
0. 引言
在linux的哲學中,所有的磁碟檔案、目錄、外設裝置、驅動裝置全部被抽象為了"檔案"這個概念,所以本文提到的"File IO"適用於linux下所有的IO操作。需要明白的是,本文分析的是linux下的IO系統呼叫對應的核心原始碼: linux下每一個系統呼叫都有對應的核心原始碼,而我們在ring3程式設計時常用的glibc等C庫API,只是對系統呼叫的一層封裝,最終還是要通過系統呼叫實現功能
0x1: SYSCALL_DEFINE巨集定義
我們在學習核心原始碼的時候經常會遇到一個巨集定義: SYSCALL_DEFINE,所有的系統呼叫的宣告都通過它來實現
\linux-2.6.32.63\include\linux\syscalls.h
/*
 * SYSCALL_DEFINE0..SYSCALL_DEFINE6 - entry macros used to declare a system
 * call taking 0..6 arguments.
 *
 * The excerpt was collapsed onto one physical line, which breaks the
 * backslash line continuations and leaves #else/#endif in the middle of a
 * line. The layout below restores one directive per line.
 *
 * NOTE(review): the excerpt carried an #else/#endif pair without an opening
 * conditional. The #ifdef below restores the balance using
 * CONFIG_FTRACE_SYSCALLS, the symbol that also guards SYSCALL_DEFINEx; in
 * the upstream header the conditional presumably opens earlier, before the
 * tracing helper macros - confirm against include/linux/syscalls.h.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
/*
 * Tracing build: emit the enter/exit trace events and a syscall_metadata
 * record (placed in the "__syscalls_metadata" section, 0 arguments), then
 * open the declaration of sys_<sname>(void).
 */
#define SYSCALL_DEFINE0(sname)					\
	SYSCALL_TRACE_ENTER_EVENT(_##sname);			\
	SYSCALL_TRACE_EXIT_EVENT(_##sname);			\
	static const struct syscall_metadata __used		\
	  __attribute__((__aligned__(4)))			\
	  __attribute__((section("__syscalls_metadata")))	\
	  __syscall_meta_##sname = {				\
		.name		= "sys_"#sname,			\
		.nb_args	= 0,				\
		.enter_event	= &event_enter__##sname,	\
		.exit_event	= &event_exit__##sname,		\
	};							\
	asmlinkage long sys_##sname(void)
#else
/* Non-tracing build: just the declaration of sys_<name>(void). */
#define SYSCALL_DEFINE0(name)	   asmlinkage long sys_##name(void)
#endif

/*
 * 1..6 arguments: forward to SYSCALL_DEFINEx with the argument count and the
 * name prefixed by '_'; __VA_ARGS__ carries the (type, name) pairs.
 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
...
/*
 * SYSCALL_DEFINEx(x, sname, ...) - common back end of SYSCALL_DEFINE1..6.
 * x is the argument count, sname the syscall name with a leading '_', and
 * __VA_ARGS__ the (type, name) pairs of the arguments.
 *
 * The excerpt was collapsed onto one physical line, which breaks the
 * backslash continuations and the #ifdef/#else/#endif directives; the layout
 * below restores one directive per line.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
/*
 * Tracing build: additionally emit string tables with the argument types and
 * argument names plus the syscall metadata record.
 */
#define SYSCALL_DEFINEx(x, sname, ...)				\
	static const char *types_##sname[] = {			\
		__SC_STR_TDECL##x(__VA_ARGS__)			\
	};							\
	static const char *args_##sname[] = {			\
		__SC_STR_ADECL##x(__VA_ARGS__)			\
	};							\
	SYSCALL_METADATA(sname, x);				\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#else
#define SYSCALL_DEFINEx(x, sname, ...)				\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#endif

#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS

#define SYSCALL_DEFINE(name) static inline long SYSC_##name

/*
 * Wrapper build: declare sys<name>, define SyS<name> taking the
 * __SC_LONG-declared arguments, which runs __SC_TEST and calls the inline
 * SYSC<name> with the __SC_CAST-converted arguments; sys<name> is made an
 * alias of SyS<name>. The macro ends with the header of SYSC<name>, so the
 * body that follows the macro invocation becomes the body of SYSC<name>.
 */
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));		\
	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));	\
	asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))		\
	{								\
		__SC_TEST##x(__VA_ARGS__);				\
		return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));	\
	}								\
	SYSCALL_ALIAS(sys##name, SyS##name);				\
	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))

#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */

/* No wrappers: the macro expands directly to the header of sys<name>. */
#define SYSCALL_DEFINE(name) asmlinkage long sys_##name
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))

#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
所以對函式定義
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)就等於
asmlinkage long sys_socket(int family, int type, int protocol)
Relevant Link:
http://blog.csdn.net/p_panyuch/article/details/5648007
1. open() syscall
open()系統呼叫在kernel中對應的是sys_open()
\linux-2.6.32.63\fs\open.c
/*
 * open(2) system-call entry point (SYSCALL_DEFINE3 declares it as sys_open).
 *
 * @filename: user-space pointer to the path name
 * @flags:    open flags; O_LARGEFILE is OR-ed in when force_o_largefile()
 *            is true
 * @mode:     mode argument, passed through unchanged
 *
 * Returns whatever do_sys_open() returns.
 *
 * The excerpt was collapsed onto a single line, so its '//' comment swallowed
 * the rest of the function; the statements are restored to separate lines.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
	long ret;

	if (force_o_largefile()) {
		flags |= O_LARGEFILE;
	}

	/* do_sys_open() does the real work; AT_FDCWD is passed as the dfd. */
	ret = do_sys_open(AT_FDCWD, filename, flags, mode);
	/* avoid REGPARM breakage on x86: */
	asmlinkage_protect(3, ret, filename, flags, mode);
	return ret;
}
繼續跟進do_sys_open()函式
/*
 * do_sys_open - common implementation behind the open system call.
 *
 * @dfd:      directory file descriptor handed on to do_filp_open()
 * @filename: user-space pointer to the path name
 * @flags:    open flags
 * @mode:     mode argument, handed on to do_filp_open()
 *
 * Returns the new file descriptor on success. On failure returns the error
 * from getname(), the negative value from get_unused_fd_flags(), or the
 * error encoded in the pointer returned by do_filp_open().
 *
 * The excerpt was collapsed onto a single line, so its '//' comments
 * swallowed the code that followed them; the statements are restored to
 * separate lines.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
	/*
	 * Fetch the file name via getname().
	 * NOTE(review): presumably this allocates a kernel buffer and copies
	 * the name in from user space - getname() is not shown here.
	 */
	char *tmp = getname(filename);
	int fd = PTR_ERR(tmp);

	if (!IS_ERR(tmp)) {
		/* Reserve an unused file descriptor. */
		fd = get_unused_fd_flags(flags);
		if (fd >= 0) {
			/* Got an fd: do_filp_open() performs the actual open. */
			struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
			if (IS_ERR(f)) {
				/* Open failed: give the fd back and report the error. */
				put_unused_fd(fd);
				fd = PTR_ERR(f);
			} else {
				/* Open succeeded: send the fsnotify open notification. */
				fsnotify_open(f->f_path.dentry);
				/* Bind the struct file to the reserved descriptor. */
				fd_install(fd, f);
			}
		}
		/* Release the buffer obtained from getname(). */
		putname(tmp);
	}
	return fd;
}
繼續跟進do_filp_open()函式(位於\linux-2.6.32.63\fs\namei.c)
/* * Note that the low bits of the passed in "open_flag" * are not the same as in the local variable "flag". See * open_to_namei_flags() for more details. */ struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode, int acc_mode) { /* 若干變數宣告 */ struct file *filp; struct nameidata nd; int error; struct path path; struct dentry *dir; int count = 0; int will_write; /*改變引數flag的值,具體做法是flag+1*/ int flag = open_to_namei_flags(open_flag); /*設定訪問許可權*/ if (!acc_mode) { acc_mode = MAY_OPEN | ACC_MODE(flag); } /* O_TRUNC implies we need access checks for write permissions */ /* 根據O_TRUNC標誌設定寫許可權 */ if (flag & O_TRUNC) { acc_mode |= MAY_WRITE; } /* Allow the LSM permission hook to distinguish append access from general write access. */ /* 設定O_APPEND標誌 */ if (flag & O_APPEND) { acc_mode |= MAY_APPEND; } /* The simplest case - just a plain lookup. */ /* 如果不是建立檔案 */ if (!(flag & O_CREAT)) { /* 當核心要訪問一個檔案的時候,第一步要做的是找到這個檔案,而查詢檔案的過程在vfs裡面是由path_lookup或者path_lookup_open函式來完成的 這兩個函式將使用者傳進來的字串表示的檔案路徑轉換成一個dentry結構,並建立好相應的inode和file結構,將指向file的描述符返回使用者 使用者隨後通過檔案描述符,來訪問這些資料結構 */ error = path_lookup_open(dfd, pathname, lookup_flags(flag), &nd, flag); if (error) { return ERR_PTR(error); } goto ok; } /* * Create - we need to know the parent. */ //path-init為查詢作準備工作,path_walk真正上路查詢,這兩個函式聯合起來根據一段路徑名找到對應的dentry error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); if (error) { return ERR_PTR(error); } /* 這個函式相當重要,是整個NFS的名字解析函式,其實也是NFS得以構築的函式 該函式採用一個for迴圈,對name路徑根據目錄的層次,一層一層推進,直到終點或失敗。在推進的過程中,一步步建立了目錄樹的dentry和對應的inode */ error = path_walk(pathname, &nd); if (error) { if (nd.root.mnt) { /*減少dentry和vsmount得計數*/ path_put(&nd.root); } return ERR_PTR(error); } if (unlikely(!audit_dummy_context())) { /*儲存inode節點資訊*/ audit_inode(pathname, nd.path.dentry); } /* * We have the parent and last component. First of all, check * that we are not asked to creat(2) an obvious directory - that * will not do. 
*/ error = -EISDIR; /*父節點資訊*/ if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) { goto exit_parent; } error = -ENFILE; /* 返回特定的file結構體指標 */ filp = get_empty_filp(); if (filp == NULL) { goto exit_parent; } /* 填充nameidata結構 */ nd.intent.open.file = filp; nd.intent.open.flags = flag; nd.intent.open.create_mode = mode; dir = nd.path.dentry; nd.flags &= ~LOOKUP_PARENT; nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN; if (flag & O_EXCL) { nd.flags |= LOOKUP_EXCL; } mutex_lock(&dir->d_inode->i_mutex); /*從雜湊表中查詢nd對應的dentry*/ path.dentry = lookup_hash(&nd); path.mnt = nd.path.mnt; do_last: error = PTR_ERR(path.dentry); if (IS_ERR(path.dentry)) { mutex_unlock(&dir->d_inode->i_mutex); goto exit; } if (IS_ERR(nd.intent.open.file)) { error = PTR_ERR(nd.intent.open.file); goto exit_mutex_unlock; } /* Negative dentry, just create the file */ /*如果此dentry結構沒有對應的inode節點,說明是無效的,應該建立檔案節點 */ if (!path.dentry->d_inode) { /* * This write is needed to ensure that a * ro->rw transition does not occur between * the time when the file is created and when * a permanent write count is taken through * the 'struct file' in nameidata_to_filp(). */ /*write許可權是必需的*/ error = mnt_want_write(nd.path.mnt); if (error) { goto exit_mutex_unlock; } /*按照namei格式的flag open*/ error = __open_namei_create(&nd, &path, flag, mode); if (error) { mnt_drop_write(nd.path.mnt); goto exit; } /*根據nameidata 得到相應的file結構*/ filp = nameidata_to_filp(&nd, open_flag); if (IS_ERR(filp)) { ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); } /*放棄寫許可權*/ mnt_drop_write(nd.path.mnt); if (nd.root.mnt) { /*計數減一*/ path_put(&nd.root); } return filp; } /* * It already exists. 
*/ /*要開啟的檔案已經存在*/ mutex_unlock(&dir->d_inode->i_mutex); /*儲存inode節點*/ audit_inode(pathname, path.dentry); error = -EEXIST; /*flag標誌檢查程式碼*/ if (flag & O_EXCL) { goto exit_dput; } if (__follow_mount(&path)) { error = -ELOOP; if (flag & O_NOFOLLOW) { goto exit_dput; } } error = -ENOENT; if (!path.dentry->d_inode) { goto exit_dput; } if (path.dentry->d_inode->i_op->follow_link) { goto do_link; } /*路徑裝化為相應的nameidata結構*/ path_to_nameidata(&path, &nd); error = -EISDIR; /*如果是資料夾*/ if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) { goto exit; } ok: /* * Consider: * 1. may_open() truncates a file * 2. a rw->ro mount transition occurs * 3. nameidata_to_filp() fails due to * the ro mount. * That would be inconsistent, and should * be avoided. Taking this mnt write here * ensures that (2) can not occur. */ /*檢測是否截斷檔案標誌*/ will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode); if (will_write) { /*要截斷的話就要獲取寫許可權*/ error = mnt_want_write(nd.path.mnt); if (error) { goto exit; } } //may_open執行許可權檢測、檔案開啟和truncate的操作 error = may_open(&nd.path, acc_mode, flag); if (error) { if (will_write) { mnt_drop_write(nd.path.mnt); } goto exit; } filp = nameidata_to_filp(&nd, open_flag); if (IS_ERR(filp)) { ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); } /* * It is now safe to drop the mnt write * because the filp has had a write taken * on its behalf. */ //安全的放棄寫許可權 if (will_write) { mnt_drop_write(nd.path.mnt); } if (nd.root.mnt) { path_put(&nd.root); } return filp; exit_mutex_unlock: mutex_unlock(&dir->d_inode->i_mutex); exit_dput: path_put_conditional(&path, &nd); exit: if (!IS_ERR(nd.intent.open.file)) { release_open_intent(&nd); } exit_parent: if (nd.root.mnt) { path_put(&nd.root); } path_put(&nd.path); return ERR_PTR(error); do_link: //允許遍歷連線檔案,則手工找到連線檔案對應的檔案 error = -ELOOP; if (flag & O_NOFOLLOW) { //不允許遍歷連線檔案,返回錯誤 goto exit_dput; } /* * This is subtle. Instead of calling do_follow_link() we do the * thing by hands. 
The reason is that this way we have zero link_count * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT. * After that we have the parent and last component, i.e. * we are in the same situation as after the first path_walk(). * Well, almost - if the last component is normal we get its copy * stored in nd->last.name and we will have to putname() it when we * are done. Procfs-like symlinks just set LAST_BIND. */ /* 以下是手工找到連結檔案對應的檔案dentry結構程式碼 */ //設定查詢LOOKUP_PARENT標誌 nd.flags |= LOOKUP_PARENT; //判斷操作是否安全 error = security_inode_follow_link(path.dentry, &nd); if (error) { goto exit_dput; } //處理符號連結 error = __do_follow_link(&path, &nd); if (error) { /* Does someone understand code flow here? Or it is only * me so stupid? Anathema to whoever designed this non-sense * with "intent.open". */ release_open_intent(&nd); if (nd.root.mnt) { path_put(&nd.root); } return ERR_PTR(error); } nd.flags &= ~LOOKUP_PARENT; //檢查最後一段檔案或目錄名的屬性情況 if (nd.last_type == LAST_BIND) { goto ok; } error = -EISDIR; if (nd.last_type != LAST_NORM) { goto exit; } if (nd.last.name[nd.last.len]) { __putname(nd.last.name); goto exit; } error = -ELOOP; //出現迴環標誌: 迴圈超過32次 if (count++==32) { __putname(nd.last.name); goto exit; } dir = nd.path.dentry; mutex_lock(&dir->d_inode->i_mutex); //更新路徑的掛接點和dentry path.dentry = lookup_hash(&nd); path.mnt = nd.path.mnt; __putname(nd.last.name); goto do_last; }
總結一下流程
1. open系統呼叫進入由SYSCALL_DEFINE3(open, ...)定義的sys_open()函式 2. 在sys_open()中,呼叫do_sys_open函式完成主要功能 3. 在do_sys_open函式中,呼叫函式do_filp_open完成主要的開啟功能 4. 在核心中要開啟一個檔案,首先應該找到這個檔案,而查詢檔案的過程在vfs裡面是由do_path_lookup或者path_lookup_open函式來完成的 4.1 設定nd->root=根路徑(絕對路徑)或者當前工作目錄(相對路徑) 4.2 這一步做完了後,核心會建立一些資料結構(dentry,inode)來初始化查詢的起點 if(!retval){ retval = path_walk(name,nd);} 4.3 path_walk會遍歷路徑的每一節點分量,也就是用"/"分隔開的每一部分,最終找到name指向的檔案 int path_walk(const char *name,struct nameidata *nd) { return link_path_walk(name,nd); //path_walk其實相當於直接呼叫link_path_walk來完成工作 } 4.4 link_path_walk的主要工作是由其內部函式__link_path_walk來完成的 result = __link_path_walk(name,nd) 4.5 __link_path_walk,該函式把傳進來的字串name,也就是使用者指定的路徑,按路徑分隔符分解成一系列小的component。比如使用者說,我要找"/path/to/dest"這個檔案,那麼我們的檔案系統就會按path、to、dest一個
一個來找,直到最後一個分量是檔案或者查詢完成。它找的時候,會先用path_init初始化過的根路徑去找第一個分量,也就是path。然後用path的dentry->d_inode去找to,這樣迴圈到最後一個。注意,核心會快取找到的路徑分量,
所以往往只有第一次訪問一個路徑的時候,才會去訪問磁碟,後面的訪問會直接從快取裡找,下面會看到,很多與頁高速快取打交道的程式碼。但不管怎樣,第一遍查詢總是會訪問磁碟的 static int __link_path_walk(const char *name,struct nameidata *nd){..} 至此,按照每一個component查詢完成之後,就會找到相應的檔案,然後相應的開啟工作就基本完成了
Relevant Link:
http://oss.org.cn/kernel-book/ http://blog.csdn.net/f413933206/article/details/5701913
2. close() syscall
close()系統呼叫對應核心中的函式為: sys_close()
\linux-2.6.32.63\fs\open.c
/*
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 *
 * close(2) system-call entry point (SYSCALL_DEFINE1 declares it as
 * sys_close).
 *
 * @fd: file descriptor to close
 *
 * Returns -EBADF when fd is out of range or its slot is empty, otherwise the
 * result of filp_close(), with the restart codes mapped to -EINTR.
 *
 * The excerpt was collapsed onto a single line, so its '//' comment swallowed
 * the code that followed it; the statements are restored to separate lines.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
	struct file * filp;
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	int retval;

	spin_lock(&files->file_lock);
	/*
	 * Get the pointer to this task's struct fdtable.
	 * \linux-2.6.32.63\include\linux\fdtable.h
	 * #define files_fdtable(files) (rcu_dereference((files)->fdt))
	 */
	fdt = files_fdtable(files);
	if (fd >= fdt->max_fds) {
		goto out_unlock;
	}
	/* Fetch the struct file pointer stored in the slot for this fd. */
	filp = fdt->fd[fd];
	if (!filp) {
		goto out_unlock;
	}
	/* Clear the slot, then the fd's close-on-exec bit. */
	rcu_assign_pointer(fdt->fd[fd], NULL);
	FD_CLR(fd, fdt->close_on_exec);
	/*
	 * __put_unused_fd() recycles the descriptor so that a later open can
	 * hand it out again:
	 *
	 * static void __put_unused_fd(struct files_struct *files, unsigned int fd)
	 * {
	 *	struct fdtable *fdt = files_fdtable(files);
	 *	__FD_CLR(fd, fdt->open_fds);
	 *	if (fd < files->next_fd) {
	 *		files->next_fd = fd;
	 *	}
	 * }
	 */
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
	retval = filp_close(filp, files);

	/* can't restart close syscall because file table entry was cleared */
	if (unlikely(retval == -ERESTARTSYS ||
		     retval == -ERESTARTNOINTR ||
		     retval == -ERESTARTNOHAND ||
		     retval == -ERESTART_RESTARTBLOCK)) {
		retval = -EINTR;
	}

	return retval;

out_unlock:
	spin_unlock(&files->file_lock);
	return -EBADF;
}
EXPORT_SYMBOL(sys_close);
對於sys_close(),我們需要重點跟進其中的2個呼叫: 巨集rcu_assign_pointer(fdt->fd[fd], NULL);、函式retval = filp_close(filp, files);
\linux-2.6.32.63\include\linux\rcupdate.h
/**
 * rcu_assign_pointer - assign (publicize) a pointer to a newly
 * initialized structure that will be dereferenced by RCU read-side
 * critical sections.  Returns the value assigned.
 *
 * Inserts memory barriers on architectures that require them
 * (pretty much all of them other than x86), and also prevents
 * the compiler from reordering the code that initializes the
 * structure after the pointer assignment.  More importantly, this
 * call documents which pointers will be dereferenced by RCU read-side
 * code.
 *
 * The write barrier is skipped only when @v is a compile-time constant
 * NULL. The excerpt was collapsed onto a single line, which breaks the
 * backslash continuations; the multi-line form is restored below.
 */
#define rcu_assign_pointer(p, v) \
	({ \
		if (!__builtin_constant_p(v) || \
		    ((v) != NULL)) \
			smp_wmb(); \
		(p) = (v); \
	})
我們知道,每個程式在kernel中都有一個對應的task_struct,而通過task_struct可以間接地獲得一個fd_array[]陣列,表示當前程式已經開啟的檔案: 陣列的下標就是檔案描述符的值,每一個元素是指向對應struct file的指標,只有通過這個fd_array[x]才能獲取當前程式開啟的檔案的struct file*。而rcu_assign_pointer(fdt->fd[fd], NULL)的作用就在於將這個陣列的指定元素置空,即斷開了這個引用的關係。至於之後核心中的那個struct file是否釋放,那是引用計數和記憶體回收的事,至少現在程式想通過task_struct是無法再引用到之前開啟過的檔案了,這裡面的關係圖可以參閱:
http://www.cnblogs.com/LittleHann/p/3865490.html //搜尋: 用一張圖表示task_struct、fs_struct、files_struct、fdtable、file的關係
我們繼續分析retval = filp_close(filp, files);
\linux-2.6.32.63\fs\open.c
/*
 * filp_close - flush an open file and give up one reference to it.
 *
 * @filp: the open file being closed
 * @id:   "id" is the POSIX thread ID. We use the files pointer for this..
 *
 * Logs an error and returns 0 when the file count is already 0. Otherwise
 * calls the file's flush operation when one is provided, then
 * dnotify_flush(), locks_remove_posix() and finally fput().
 *
 * Returns the result of ->flush(), or 0 when there is no flush operation.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
	int flush_status = 0;

	if (file_count(filp) == 0) {
		printk(KERN_ERR "VFS: Close: file count is 0\n");
		return 0;
	}

	/* ->flush() is optional; only its result is reported to the caller. */
	if (filp->f_op != NULL && filp->f_op->flush != NULL)
		flush_status = filp->f_op->flush(filp, id);

	dnotify_flush(filp, id);
	locks_remove_posix(filp, id);
	fput(filp);

	return flush_status;
}
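filp_close()依次呼叫f_op->flush()(如果該檔案提供了flush操作)、dnotify_flush()、locks_remove_posix(),最後通過fput()放棄對struct file的一次引用;只有當引用計數降為0時,struct file佔用的記憶體空間才會真正被釋放。至此,當前程式的檔案描述符表中就再也沒有之前開啟過的這個檔案的任何痕跡了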
Relevant Link:
http://blog.csdn.net/ce123_zhouwei/article/details/8459794
Copyright (c) 2014 LittleHann All rights reserved