問題描述
硬碟透過RAID(raid驅動為megaraid)方式掛載至系統。手動修改塊裝置sdb的預讀引數(read_ahead_kb)後,使用dd命令向該塊裝置寫資料,預讀引數隨即發生變化,不再是手動設定的值。
問題總結
預讀引數改變的原因是由於systemd-udevd服務對sd塊裝置做了IO監聽,一旦發生寫操作,就會觸發on_inotify處理函式重新獲取分割槽資訊,重新獲取分割槽資訊時會重置預讀引數(sd_revalidate_disk函式中重置了預讀引數)。修復sd_revalidate_disk函式修改預讀引數的規則,可以修復該問題。
問題分析
客戶反饋:UOS 1032正常、UOS1040異常、麒麟25.2異常,但三者raid驅動版本一致。
根據客戶反饋,加上預讀引數本身與raid驅動無明顯關聯,排除raid驅動問題。當前驗證思路:先確認預讀引數是由誰(核心模組或上層應用)修改的。
上層應用:手動修改預讀引數後,去掉預讀引數檔案的寫許可權(chmod 444),預讀引數仍然發生變化,因此排除上層應用主動修改預讀引數。核心模組:定位預讀引數read_ahead_kb在核心中的位置:
/*
 * Queue sysfs attribute descriptor for "read_ahead_kb".
 * Mode 0644: readable by everyone, writable by the owner.
 * Reads are served by queue_ra_show, writes by queue_ra_store.
 */
static struct queue_sysfs_entry queue_ra_entry = {
.attr = {.name = "read_ahead_kb", .mode = 0644 },
.show = queue_ra_show,
.store = queue_ra_store,
};
/*
 * Store handler for the "read_ahead_kb" attribute (installed as
 * queue_ra_entry.store).
 *
 * q:     request queue whose read-ahead setting is updated
 * page:  buffer holding the value written by the caller
 * count: length of the data in page
 *
 * Returns the value returned by queue_var_store(); a negative value is
 * propagated unchanged and leaves ra_pages untouched.
 */
static ssize_t
queue_ra_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long ra_kb;
/* NOTE(review): queue_var_store() is not shown here; presumably it parses
 * page into ra_kb and returns the consumed length or a negative errno. */
ssize_t ret = queue_var_store(&ra_kb, page, count);
if (ret < 0)
return ret;
/* Convert kilobytes to pages: one page is 2^(PAGE_SHIFT - 10) KB.
 * This field is where the read-ahead setting actually lives. */
q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
return ret;
}
預讀引數儲存在q->backing_dev_info->ra_pages中,向相關位置新增除錯資訊,定位改動原因。
透過除錯資訊可知sd_revalidate_disk函式修改了預讀引數。
static int sd_revalidate_disk(struct gendisk *disk)
{
... ...
q->backing_dev_info->ra_pages = max_t(unsigned long, VM_MAX_READAHEAD,
ra_kb) * 1024 / PAGE_SIZE;
set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity));
sd_config_write_same(sdkp);
kfree(buffer);
out:
return 0;
}
檢視堆疊資訊發現sd_revalidate_disk是透過系統呼叫執行的(el0_svc為使用者態系統呼叫的入口)
Dec 8 13:13:32 localhost kernel: [ 352.426379] CPU: 79 PID: 1739 Comm: systemd-udevd Not tainted 4.19.90-25.2.v2101.gfb012.ky10.aarch64 #1
Dec 8 13:13:32 localhost kernel: [ 352.436164] Hardware name: Unisyue Technologies Co., Ltd. UNIS Server R3810 G5/RS41M2C9S, BIOS KL4.1.60 12/02/2021
Dec 8 13:13:32 localhost kernel: [ 352.446896] Call trace:
Dec 8 13:13:32 localhost kernel: [ 352.449777] dump_backtrace+0x0/0x170
Dec 8 13:13:32 localhost kernel: [ 352.453856] show_stack+0x24/0x30
Dec 8 13:13:32 localhost kernel: [ 352.457598] dump_stack+0xa4/0xe8
Dec 8 13:13:32 localhost kernel: [ 352.461338] sd_revalidate_disk+0x3a4/0x1300
Dec 8 13:13:32 localhost kernel: [ 352.466026] rescan_partitions+0xac/0x3b8
Dec 8 13:13:32 localhost kernel: [ 352.470451] __blkdev_reread_part+0x60/0x88
Dec 8 13:13:32 localhost kernel: [ 352.475048] blkdev_reread_part+0x2c/0x48
Dec 8 13:13:32 localhost kernel: [ 352.479472] blkdev_ioctl+0x498/0xb88
Dec 8 13:13:32 localhost kernel: [ 352.483556] block_ioctl+0x50/0x68
Dec 8 13:13:32 localhost kernel: [ 352.487376] do_vfs_ioctl+0xb0/0x898
Dec 8 13:13:32 localhost kernel: [ 352.491366] ksys_ioctl+0x8c/0xa0
Dec 8 13:13:32 localhost kernel: [ 352.495099] __arm64_sys_ioctl+0x28/0x98
Dec 8 13:13:32 localhost kernel: [ 352.499439] el0_svc_common+0x78/0x130
Dec 8 13:13:32 localhost kernel: [ 352.503603] el0_svc_handler+0x38/0x78
Dec 8 13:13:32 localhost kernel: [ 352.507768] el0_svc+0x8/0x1b0
Comm顯示具體應用為systemd-udevd
查閱systemd-udevd原始碼,發現systemd-udevd對sd塊裝置做了inotify監聽,一旦發生寫操作(以寫方式開啟的裝置節點被關閉,產生IN_CLOSE_WRITE事件),就會觸發on_inotify處理函式重新獲取分割槽資訊。其中BLKRRPART為「重新讀取分割槽表」的ioctl命令。on_inotify函式呼叫鏈為:on_inotify-->synthesize_change-->ioctl(fd, BLKRRPART, 0)
udevd.c:
static int synthesize_change(sd_device *dev) {
... ...
if (streq_ptr("block", subsystem) &&
streq_ptr("disk", devtype) &&
!startswith(sysname, "dm-")) {
_cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
bool part_table_read = false, has_partitions = false;
sd_device *d;
int fd;
fd = open(devname, O_RDONLY|O_CLOEXEC|O_NOFOLLOW|O_NONBLOCK);
if (fd >= 0) {
r = flock(fd, LOCK_EX|LOCK_NB);
if (r >= 0)
r = ioctl(fd, BLKRRPART, 0);
close(fd);
if (r >= 0)
part_table_read = true;
}
... ...
}
static int on_inotify(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
... ...
FOREACH_INOTIFY_EVENT(e, buffer, l) {
_cleanup_(sd_device_unrefp) sd_device *dev = NULL;
const char *devnode;
if (udev_watch_lookup(e->wd, &dev) <= 0)
continue;
if (sd_device_get_devname(dev, &devnode) < 0)
continue;
log_device_debug(dev, "Inotify event: %x for %s", e->mask, devnode);
if (e->mask & IN_CLOSE_WRITE)
synthesize_change(dev);
else if (e->mask & IN_IGNORED)
udev_watch_end(dev);
}
return 1;
}
static int main_loop(Manager *manager) {
... ...
r = sd_event_add_io(manager->event, &manager->inotify_event, manager->fd_inotify, EPOLLIN, on_inotify, manager);
if (r < 0)
return log_error_errno(r, "Failed to create inotify event source: %m");
... ...
}
繼續跟蹤sd_revalidate_disk的歷史提交記錄發現,ra_pages的修改是在64cf457219acf8e3524530af064784f5677682fe版本中提交的,目的是採用硬碟VPD資訊的OPTIMAL TRANSFER LENGTH(最優傳輸長度)來調整read_ahead_kb。
導致問題的補丁
對sd_revalidate_disk函式進行歷史追溯,發現修改預讀引數的功能是在64cf457219acf8e3524530af064784f5677682fe中提交的
From 64cf457219acf8e3524530af064784f5677682fe Mon Sep 17 00:00:00 2001
From: huhai <huhai@kylinos.cn>
Date: Tue, 3 Mar 2020 16:17:42 +0800
Subject: [PATCH] KYLIN: block/sd: incrase read_ahead_kb for FC-SAN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
使用硬碟VPD資訊的OPTIMAL TRANSFER LENGTH(最優傳輸長度)來調整read_ahead_kb
Signed-off-by: huhai <huhai@kylinos.cn>
Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
---
drivers/scsi/sd.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index f9d02f638c43..88958c7e5330 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3130,7 +3130,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
struct request_queue *q = sdkp->disk->queue;
sector_t old_capacity = sdkp->capacity;
unsigned char *buffer;
- unsigned int dev_max, rw_max;
+ unsigned int dev_max, rw_max, ra_kb;
SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
"sd_revalidate_disk\n"));
@@ -3199,9 +3199,12 @@ static int sd_revalidate_disk(struct gendisk *disk)
if (sd_validate_opt_xfer_size(sdkp, dev_max)) {
q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks);
rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks);
- } else
+ ra_kb = sdkp->opt_xfer_blocks;
+ } else {
rw_max = min_not_zero(logical_to_sectors(sdp, dev_max),
(sector_t)BLK_DEF_MAX_SECTORS);
+ ra_kb = VM_MAX_READAHEAD;
+ }
/* Do not exceed controller limit */
rw_max = min(rw_max, queue_max_hw_sectors(q));
@@ -3217,6 +3220,8 @@ static int sd_revalidate_disk(struct gendisk *disk)
sdkp->first_scan = 0;
+ q->backing_dev_info->ra_pages = max_t(unsigned long, VM_MAX_READAHEAD,
+ ra_kb) * 1024 / PAGE_SIZE;
set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity));
sd_config_write_same(sdkp);
kfree(buffer);
--
2.23.0
解決方案
修改核心程式碼:僅在硬碟初始化、首次進行掃描(sdkp->first_scan為真)時配置最佳預讀引數;之後重讀分割槽資訊時不再修改預讀引數,從而保留使用者手動設定的值。
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 7914f304255d..8e9d9d3065df 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3235,10 +3235,11 @@ static int sd_revalidate_disk(struct gendisk *disk)
q->limits.max_sectors > q->limits.max_hw_sectors)
q->limits.max_sectors = rw_max;
- sdkp->first_scan = 0;
+ if (sdkp->first_scan)
+ q->backing_dev_info->ra_pages = max_t(unsigned long, VM_MAX_READAHEAD,
+ ra_kb) * 1024 / PAGE_SIZE;
- q->backing_dev_info->ra_pages = max_t(unsigned long, VM_MAX_READAHEAD,
- ra_kb) * 1024 / PAGE_SIZE;
+ sdkp->first_scan = 0;
set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity));
sd_config_write_same(sdkp);
kfree(buffer);
--
2.23.0