Linux SYNC-fs

Posted by 王二車 on 2019-02-21

sync flushes buffered and cached data out to the disk device.

Usage scenarios

1. Running the sync command on the Linux command line

2. Calling the sync library function (see the sketch below)

3. Running reboot or poweroff on the command line, both of which go through sync; the -n flag means "Do not sync"
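
A minimal userspace sketch of scenario 2, assuming nothing beyond libc; the command-line sync of scenario 1 ends up in the same sys_sync path analyzed below:

#include <unistd.h>

int main(void)
{
	sync();		/* enters the sys_sync path analyzed below */
	return 0;
}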

Code analysis

The sync system call

SYSCALL_DEFINE0(sync)
{
	int nowait = 0, wait = 1;

	wakeup_flusher_threads(0, WB_REASON_SYNC);	/* wake the flusher threads to start writeback */
	iterate_supers(sync_inodes_one_sb, NULL);	/* write out dirty inodes of every superblock */
	iterate_supers(sync_fs_one_sb, &nowait);	/* first sync_fs pass: kick off, do not wait */
	iterate_supers(sync_fs_one_sb, &wait);		/* second pass: wait for completion */
	iterate_bdevs(fdatawrite_one_bdev, NULL);	/* write dirty pages of the block devices themselves */
	iterate_bdevs(fdatawait_one_bdev, NULL);	/* wait for those writes to finish */
	if (unlikely(laptop_mode))
		laptop_sync_completion();
	return 0;
}

Waking the flusher threads

wakeup_flusher_threads(long nr_pages, enum wb_reason reason): when nr_pages is 0 it means all cached data must be written back; every dirty page in the page cache is counted and handed to the writeback threads.

/*
 * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
{
	struct backing_dev_info *bdi;

	if (!nr_pages)
		nr_pages = get_nr_dirty_pages();	/* count every dirty page in the page cache */

	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		if (!bdi_has_dirty_io(bdi))
			continue;
		__bdi_start_writeback(bdi, nr_pages, false, reason);	/* queue the writeback work */
	}
	rcu_read_unlock();
}

Queueing the writeback work:

static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
		      bool range_cyclic, enum wb_reason reason)
{
	struct wb_writeback_work *work;

	/*
	 * This is WB_SYNC_NONE writeback, so if allocation fails just
	 * wakeup the thread for old dirty data writeback
	 */
	work = kzalloc(sizeof(*work), GFP_ATOMIC);
	if (!work) {
		trace_writeback_nowork(bdi);
		bdi_wakeup_thread(bdi);
		return;
	}

	work->sync_mode	= WB_SYNC_NONE;
	work->nr_pages	= nr_pages;
	work->range_cyclic = range_cyclic;
	work->reason	= reason;

	bdi_queue_work(bdi, work);
}

This allocates and initializes a new writeback work item and finally adds it to the bdi's work list.
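
For context, bdi_queue_work itself is not shown by the author; as a hedged sketch from kernels of roughly this era (the exact body varies by version), it links the work item into the bdi's work list and kicks the per-bdi flusher with no delay:

static void bdi_queue_work(struct backing_dev_info *bdi,
			   struct wb_writeback_work *work)
{
	trace_writeback_queue(bdi, work);

	spin_lock_bh(&bdi->wb_lock);
	list_add_tail(&work->list, &bdi->work_list);
	spin_unlock_bh(&bdi->wb_lock);

	/* wake the per-bdi kworker immediately */
	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
}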

Syncing the files of every filesystem in the system

iterate_supers walks every filesystem in the system and runs the callback f on each:

void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		sb->s_count++;
		spin_unlock(&sb_lock);

		down_read(&sb->s_umount);
		if (sb->s_root && (sb->s_flags & MS_BORN))
			f(sb, arg);
		up_read(&sb->s_umount);

		spin_lock(&sb_lock);
		if (p)
			__put_super(p);
		p = sb;
	}
	if (p)
		__put_super(p);
	spin_unlock(&sb_lock);
}

Normally the superblock gets written back in a few situations: the sync command, umount, and mount. The first callback handed to iterate_supers is sync_inodes_one_sb, which skips read-only filesystems:

static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
	if (!(sb->s_flags & MS_RDONLY))
		sync_inodes_sb(sb);
}

sync_inodes_sb builds a writeback work item, queues it on the work list, and then waits for all of the filesystem's files to be flushed:

/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.nr_pages	= LONG_MAX,
		.range_cyclic	= 0,
		.done		= &done,
		.reason		= WB_REASON_SYNC,
		.for_sync	= 1,
	};

	/* Nothing to do? */
	if (sb->s_bdi == &noop_backing_dev_info)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	bdi_queue_work(sb->s_bdi, &work);
	wait_for_completion(&done);

	wait_sb_inodes(sb);
}

sb is the superblock, i.e., one mounted filesystem; an inode represents one file; the file's scattered data blocks are addressed through its i_mapping:

static void wait_sb_inodes(struct super_block *sb)
{
	/* ... walk every inode belonging to this filesystem ... */
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;	/* the file's page-cache address space */

		spin_lock(&inode->i_lock);
		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
		    (mapping->nrpages == 0)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		/* ... */
		/* wait for the cached data blocks to finish writeback;
		 * this ends up in filemap_fdatawait_range() */
		filemap_fdatawait(mapping);
	}
	/* ... */
}
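
For reference, filemap_fdatawait itself is a thin wrapper that bounds the wait by the current file size (a sketch from kernels of this era):

int filemap_fdatawait(struct address_space *mapping)
{
	loff_t i_size = i_size_read(mapping->host);	/* current file size */

	if (i_size == 0)
		return 0;

	return filemap_fdatawait_range(mapping, 0, i_size - 1);
}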

filemap_fdatawait_range checks whether all of a given file's cached data has finished writeback: it looks up, through the mapping, every page tagged PAGECACHE_TAG_WRITEBACK and calls wait_on_page_writeback on each. A page whose writeback is already complete lets the loop move on to the next page; otherwise the task sleeps in the uninterruptible D state until it is woken.

int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
	struct pagevec pvec;
	int nr_pages;
	int ret2, ret = 0;

	if (end_byte < start_byte)
		goto out;

	pagevec_init(&pvec, 0);
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;

			wait_on_page_writeback(page);
			if (TestClearPageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
out:
	ret2 = filemap_check_errors(mapping);
	if (!ret)
		ret = ret2;

	return ret;
}

The analysis pauses here; the io_schedule path inside wait_on_page_writeback is left for another day.

wait_sb_inodes(sb) checks every in-use inode of the superblock to see whether the pages behind its mapping have finished writeback; when a page completes, the waiter is woken, otherwise the task blocks in the uninterruptible D state.

Multi-process sync test

Start six processes, each writing 16 MB files in a loop and calling sync after every file. This verifies that if other processes keep writing into the cache while a sync is in flight, the sync can block for a very long time, and that once one sync is stuck, every other sync gets stuck as well.

Per-process script: n is the process index, a is the incrementing file-name suffix;

n=1;a=1;while true;do dd if=/dev/zero of=$n-$a.img bs=1M count=16 2>/dev/null;sync;echo $n="$a";a=`expr $a + 1`;done &

Process states as seen with ps:

31118 root      2732 D    sync
31119 root      2732 D    sync
31122 root      2732 D    sync
31123 root      2732 D    sync
31126 root      2732 D    sync
31131 root      2732 D    sync

13298 root      2732 D    sync
13302 root      2732 D    sync
13306 root      2732 D    sync
13314 root      2732 D    sync
13320 root      2732 D    sync
13576 root      2732 D    sync

Test conclusions

As soon as one sync gets stuck, all later ones get stuck too: the first sync holds the cached inodes, and the later syncs block waiting for them to be released before re-checking the writeback state.

Calling sync right after every file write is tolerable for a single process, but with multiple processes the write speed drops sharply and each sync blocks the others; per-file or per-filesystem flushes, sketched below, keep the wait local.
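
A hedged userspace sketch of those narrower alternatives: fdatasync(2) waits only for one file's data, and the Linux-specific syncfs(2) (glibc wrapper, kernel >= 2.6.39) only for one filesystem, so unrelated writers are not dragged into a global wait. The file name 1-1.img is just borrowed from the test script:

#define _GNU_SOURCE		/* for syncfs() */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("1-1.img", O_WRONLY);	/* a file produced by the test script */
	if (fd < 0)
		return 1;

	fdatasync(fd);	/* wait only for this file's data */
	syncfs(fd);	/* wait only for this file's filesystem */

	close(fd);
	return 0;
}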

Syncing filesystem data

	iterate_supers(sync_fs_one_sb, &nowait);
	iterate_supers(sync_fs_one_sb, &wait);

static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
	/* not read-only, and the filesystem implements sync_fs */
	if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs)
		sb->s_op->sync_fs(sb, *(int *)arg);
}

ext4 wires up sync_fs as follows:

static const struct super_operations ext4_sops = {
	.sync_fs	= ext4_sync_fs,
};
static int ext4_sync_fs(struct super_block *sb, int wait)
{
	int ret = 0;
	tid_t target;
	bool needs_barrier = false;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	trace_ext4_sync_fs(sb, wait);
	flush_workqueue(sbi->rsv_conversion_wq);
	/*
	 * Writeback quota in non-journalled quota case - journalled quota has
	 * no dirty dquots
	 */
	dquot_writeback_dquots(sb, -1);
	/*
	 * Data writeback is possible w/o journal transaction, so barrier must
	 * being sent at the end of the function. But we can skip it if
	 * transaction_commit will do it for us.
	 */
	if (sbi->s_journal) {
		target = jbd2_get_latest_transaction(sbi->s_journal);
		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
			needs_barrier = true;

		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
			if (wait)
				ret = jbd2_log_wait_commit(sbi->s_journal,
							   target);
		}
	} else if (wait && test_opt(sb, BARRIER))
		needs_barrier = true;
	if (needs_barrier) {
		int err;
		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
		if (!ret)
			ret = err;
	}

	return ret;
}

The function above checks whether the filesystem is journalled; if so, jbd2 is started to commit the running transaction. The journal records the write ordering of metadata and data and the state reached once the data completes, and the record is deleted afterwards. If power is lost before the data is fully written, the journal is replayed at the next mount to recover it.

Writing back cached data

Flush the cached data of all the files in the system, locating the page-cache data through i_mapping. iterate_bdevs walks the block-device inodes:

void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&inode_sb_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_sb_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * inode_sb_list_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * inode_sb_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;

		func(I_BDEV(inode), arg);

		spin_lock(&inode_sb_list_lock);
	}
	spin_unlock(&inode_sb_list_lock);
	iput(old_inode);
}
static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
{
	filemap_fdatawrite(bdev->bd_inode->i_mapping);
}
int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
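/* The intermediate helper __filemap_fdatawrite is not shown above; as a
 * sketch from kernels of this era, it simply expands the range to the
 * whole file before calling __filemap_fdatawrite_range: */
static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}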
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	if (!mapping_cap_writeback_dirty(mapping))
		return 0;

	ret = do_writepages(mapping, &wbc);
	return ret;
}
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;

	if (wbc->nr_to_write <= 0)
		return 0;
	if (mapping->a_ops->writepages)
		ret = mapping->a_ops->writepages(mapping, wbc);
	else
		ret = generic_writepages(mapping, wbc);
	return ret;
}

ext4_da_writepages is called first; later the path comes back through here and calls generic_writepages, which in the end invokes __writepage.

int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret;

	/* deal with chardevs and other special file */
	if (!mapping->a_ops->writepage)
		return 0;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
	blk_finish_plug(&plug);
	return ret;
}

Now look at write_cache_pages (excerpted):

int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	/* ... */
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* ... */
			ret = (*writepage)(page, wbc, data);
			/* ... */
		}
	}
	/* ... */
}

The writepage callback is invoked at the end; for ext4, .writepage = ext4_writepage:

static int __writepage(struct page *page, struct writeback_control *wbc,
		       void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);
	mapping_set_error(mapping, ret);
	return ret;
}
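
For ext4 with delayed allocation in kernels of this era, the callbacks are wired up roughly as follows (a trimmed excerpt; most fields omitted):

static const struct address_space_operations ext4_da_aops = {
	.writepage	= ext4_writepage,
	.writepages	= ext4_da_writepages,
	/* ... other callbacks omitted ... */
};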

Writing dirty pages to disk

ext4_writepage
	ext4_bio_write_page
		io_submit_add_bh
			io_submit_init
				bio->bi_end_io = ext4_end_bio
	ext4_io_submit
		submit_bio		// submit a bio to the block device layer; total pages tracked by the author's counter io_bio += (count/8);
			generic_make_request
				q->make_request_fn(q, bio) //blk_queue_bio
					get_request
					init_request_from_bio
					plug = current->plug;
					__blk_run_queue
						q->request_fn(q) == scsi_request_fn	// submit the request to the SCSI layer; pages submitted tracked by the author's counter io_in += (scsi_bufflen(cmd)/4096);
							scsi_init_cmd_errh
							scsi_dispatch_cmd
								cmd->scsi_done = scsi_done;
								host->hostt->queuecommand(host, cmd);
ext4_da_writepages
	mpage_da_map_and_submit
		mpage_da_submit_io
			ext4_io_submit
				submit_bio
					generic_make_request
						q->make_request_fn(q, bio);
							blk_queue_make_request(q, blk_queue_bio);
							q->make_request_fn = mfn;
								blk_queue_bio
									blk_queue_bounce
									get_request
									init_request_from_bio
									__blk_run_queue(q);
										__blk_run_queue_uncond(q);
											q->request_fn(q);== scsi_request_fn

That finishes the fs layer; the next step is the block layer's submit_bio function.

Waiting for cached data

fdatawait_one_bdev waits on each block device's i_mapping. filemap_fdatawait_range is listed again below, this time with debug printk lines the author added to trace when the wait begins and ends:

static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
{
	filemap_fdatawait(bdev->bd_inode->i_mapping);
}
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
	struct pagevec pvec;
	int nr_pages;
	int ret2, ret = 0;

	if (end_byte < start_byte)
		goto out;
	printk(KERN_ERR "wait,in\n");	/* author's debug print: wait begins */
	pagevec_init(&pvec, 0);
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;

			wait_on_page_writeback(page);	/* sleep in D state until writeback completes */
			if (TestClearPageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
out:
	ret2 = filemap_check_errors(mapping);
	if (!ret)
		ret = ret2;
	printk(KERN_ERR "wait,out\n");	/* author's debug print: wait ends */

	return ret;
}
static inline void wait_on_page_writeback(struct page *page)
{
	if (PageWriteback(page))
		wait_on_page_bit(page, PG_writeback);
}
void wait_on_page_bit(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (test_bit(bit_nr, &page->flags))
		__wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
							TASK_UNINTERRUPTIBLE);
}

The functions above put the task into the uninterruptible D state.
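
For completeness, the bit-wait action that actually sleeps is bit_wait_io, which goes through io_schedule so the sleep is accounted as I/O wait (a sketch, assuming a kernel of roughly this vintage):

__sched int bit_wait_io(struct wait_bit_key *word)
{
	io_schedule();	/* sleep, accounted as I/O wait (D state) */
	return 0;
}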
