Manual Ceph cluster deployment (adding mon and osd afterwards)

Published by 時空無限 on 2020-10-30

Environment

Operating system

cat /etc/redhat-release 
CentOS Linux release 7.2.1511 (Core) 

Kernel

uname -a
Linux host-192-168-235-123 3.10.0-327.el7.x86_64 #1 SMP Thu Nov 19 22:10:57 UTC 2015 x86_64 x86_64 x86_64 GNU/Linux

Initial environment configuration

Hostname

Set the hostname on each host:

hostnamectl set-hostname ceph-node1
hostnamectl set-hostname ceph-node2
hostnamectl set-hostname ceph-node3

On the ceph-node1 node, edit the hostname resolution file and copy it to the other two nodes:

cat /etc/hosts
127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.235.123 ceph-node1
192.168.235.137 ceph-node2
192.168.235.142 ceph-node3

scp /etc/hosts ceph-node2:/etc/
scp /etc/hosts ceph-node3:/etc/
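
To confirm that name resolution and connectivity work from the node you are on, a quick optional check is to loop over the hostnames (the list simply mirrors /etc/hosts above):

for h in ceph-node1 ceph-node2 ceph-node3; do
    # -c 1: send a single probe; -W 2: give up after 2 seconds
    ping -c 1 -W 2 "$h" >/dev/null && echo "$h OK" || echo "$h UNREACHABLE"
done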

Firewall

Run on all three nodes:

systemctl disable firewalld
systemctl stop firewalld

SELinux
Run on all three nodes:

sed -i 's#SELINUX=enforcing#SELINUX=disabled#g' /etc/selinux/config 
setenforce 0

Installation

The installation steps are performed on all nodes.

Install the time service

yum -y install chrony
systemctl enable chronyd
systemctl start chronyd
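
The cluster later warns about clock skew, so it is worth pointing all three nodes at the same time source while chrony is being set up. A minimal sketch, assuming ceph-node1 is used as the local time server for the other two nodes (any common NTP source works just as well):

# On ceph-node1: allow the cluster subnet to sync from this node
echo "allow 192.168.235.0/24" >> /etc/chrony.conf
systemctl restart chronyd

# On ceph-node2 and ceph-node3: use ceph-node1 as the time source
echo "server ceph-node1 iburst" >> /etc/chrony.conf
systemctl restart chronyd
chronyc sources    # the ceph-node1 entry should appear and eventually be selected (marked with *)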

Install the EPEL repository

yum -y install epel-release

Uncomment the baseurl lines in epel.repo:

cat /etc/yum.repos.d/epel.repo 
[epel]
name=Extra Packages for Enterprise Linux 7 - $basearch
baseurl=http://download.fedoraproject.org/pub/epel/7/$basearch
metalink=https://mirrors.fedoraproject.org/metalink?repo=epel-7&arch=$basearch
failovermethod=priority
enabled=1
gpgcheck=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7

[epel-debuginfo]
name=Extra Packages for Enterprise Linux 7 - $basearch - Debug
baseurl=http://download.fedoraproject.org/pub/epel/7/$basearch/debug
metalink=https://mirrors.fedoraproject.org/metalink?repo=epel-debug-7&arch=$basearch
failovermethod=priority
enabled=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7
gpgcheck=1

[epel-source]
name=Extra Packages for Enterprise Linux 7 - $basearch - Source
baseurl=http://download.fedoraproject.org/pub/epel/7/SRPMS
metalink=https://mirrors.fedoraproject.org/metalink?repo=epel-source-7&arch=$basearch
failovermethod=priority
enabled=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7
gpgcheck=1

Install Ceph's third-party dependencies

yum -y install snappy leveldb gdisk python-argparse gperftools-libs

Configure the Aliyun Ceph repository

cat >/etc/yum.repos.d/ceph.repo<<eof
[Ceph-SRPMS]
name=Ceph SRPMS packages
baseurl=https://mirrors.aliyun.com/ceph/rpm-jewel/el7/SRPMS/
enabled=1
gpgcheck=0
type=rpm-md
 
[Ceph-aarch64]
name=Ceph aarch64 packages
baseurl=https://mirrors.aliyun.com/ceph/rpm-jewel/el7/aarch64/
enabled=1
gpgcheck=0
type=rpm-md
 
[Ceph-noarch]
name=Ceph noarch packages
baseurl=https://mirrors.aliyun.com/ceph/rpm-jewel/el7/noarch/
enabled=1
gpgcheck=0
type=rpm-md
 
[Ceph-x86_64]
name=Ceph x86_64 packages
baseurl=https://mirrors.aliyun.com/ceph/rpm-jewel/el7/x86_64/
enabled=1
gpgcheck=0
type=rpm-md
eof

Install Ceph

yum install ceph -y

Verify

rpm -qa|egrep -i "ceph|rados|rbd"
librados2-10.2.11-0.el7.x86_64
python-cephfs-10.2.11-0.el7.x86_64
ceph-mds-10.2.11-0.el7.x86_64
ceph-10.2.11-0.el7.x86_64
librbd1-10.2.11-0.el7.x86_64
python-rbd-10.2.11-0.el7.x86_64
libradosstriper1-10.2.11-0.el7.x86_64
ceph-common-10.2.11-0.el7.x86_64
ceph-selinux-10.2.11-0.el7.x86_64
ceph-mon-10.2.11-0.el7.x86_64
ceph-osd-10.2.11-0.el7.x86_64
libcephfs1-10.2.11-0.el7.x86_64
python-rados-10.2.11-0.el7.x86_64
ceph-base-10.2.11-0.el7.x86_64

Deployment

The deployment steps are performed on ceph-node1 first.

Deploy the monitor

Create the Ceph configuration file

touch /etc/ceph/ceph.conf
uuidgen 
6b359d58-e4a0-4751-b8f0-23e90d88d794

Use the output of the uuidgen command as the fsid parameter in the configuration file:

cat  > /etc/ceph/ceph.conf  <<EOF
[global]
fsid=6b359d58-e4a0-4751-b8f0-23e90d88d794
public network = 192.168.235.0/24

osd pool default min size = 1
osd pool default pg num = 128
osd pool default pgp num = 128
osd journal size = 1024

[mon]
mon initial members = ceph-node1
mon host = ceph-node1,ceph-node2,ceph-node3 
mon addr = 192.168.235.123,192.168.235.137,192.168.235.142

[mon.ceph-node1]
host = ceph-node1
mon addr = 192.168.235.123
EOF
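
The fsid above is the literal value uuidgen returned on this host. When reproducing the setup, a small sketch like the following keeps the generated UUID and the configuration file in step (the FSID variable name is arbitrary):

FSID=$(uuidgen)                                  # generate the cluster fsid once
sed -i "s/^fsid=.*/fsid=${FSID}/" /etc/ceph/ceph.conf
grep ^fsid /etc/ceph/ceph.conf                   # confirm the value before reusing it with monmaptool below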

Create a keyring for your cluster and generate the monitor key with the following command:

ceph-authtool --create-keyring /tmp/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *'
Output
creating /tmp/ceph.mon.keyring

View the generated key file:

[root@host-192-168-235-123 ~]# cat /tmp/ceph.mon.keyring 
[mon.]
	key = AQDOYppfR0XgJRAAPwNFbVJc8VqDqtXDLP/0lQ==
	caps mon = "allow *"

Create a client.admin user and add it to the keyring:

ceph-authtool --create-keyring /etc/ceph/ceph.client.admin.keyring --gen-key -n client.admin --set-uid=0 --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow *'
Output
creating /etc/ceph/ceph.client.admin.keyring

View the generated key file:

[root@host-192-168-235-123 ~]# cat /etc/ceph/ceph.client.admin.keyring 
[client.admin]
	key = AQCYY5pfFBUzABAANujWJUCZ75qLu/aNoe9WVw==
	auid = 0
	caps mds = "allow *"
	caps mon = "allow *"
	caps osd = "allow *"

Add the client.admin key to ceph.mon.keyring:

ceph-authtool /tmp/ceph.mon.keyring --import-keyring /etc/ceph/ceph.client.admin.keyring 
Output
importing contents of /etc/ceph/ceph.client.admin.keyring into /tmp/ceph.mon.keyring

View the keyring file after the import:

[root@host-192-168-235-123 ~]# cat /tmp/ceph.mon.keyring 
[mon.]
	key = AQDOYppfR0XgJRAAPwNFbVJc8VqDqtXDLP/0lQ==
	caps mon = "allow *"
[client.admin]
	key = AQCYY5pfFBUzABAANujWJUCZ75qLu/aNoe9WVw==
	auid = 0
	caps mds = "allow *"
	caps mon = "allow *"
	caps osd = "allow *"

Generate the monitor map

Syntax

monmaptool --create --add {hostname} {ip-address} --fsid {uuid} /tmp/monmap

Run the command:

monmaptool --create --add ceph-node1 192.168.235.123 --fsid 6b359d58-e4a0-4751-b8f0-23e90d88d794 /tmp/monmap
Output
monmaptool: monmap file /tmp/monmap
monmaptool: set fsid to 6b359d58-e4a0-4751-b8f0-23e90d88d794
monmaptool: writing epoch 0 to /tmp/monmap (1 monitors)

If you got it wrong, re-run it with the following command (--clobber overwrites the existing map):

monmaptool --create --add ceph-node1 192.168.235.123 --fsid 6b359d58-e4a0-4751-b8f0-23e90d88d794 /tmp/monmap --clobber
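
Before feeding the map to ceph-mon, its contents can be printed back as an optional check:

monmaptool --print /tmp/monmap
# Expect epoch 0, the fsid generated above, and a single entry 0: 192.168.235.123:6789/0 mon.ceph-node1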

Create a directory for the monitor in the /path/cluster_name-monitor_node format:

ll /var/lib/ceph/mon/
total 0
mkdir /var/lib/ceph/mon/ceph-ceph-node1

Populate it with the first monitor daemon's data:

ceph-mon --mkfs -i ceph-node1 --monmap /tmp/monmap --keyring /tmp/ceph.mon.keyring 
Output
ceph-mon: set fsid to 6b359d58-e4a0-4751-b8f0-23e90d88d794
ceph-mon: created monfs at /var/lib/ceph/mon/ceph-ceph-node1 for mon.ceph-node1

Set ownership

chown -R ceph.ceph /var/lib/ceph/mon/ceph-ceph-node1/

Start ceph-mon

systemctl start ceph-mon@ceph-node1.service
systemctl start ceph.target

If it reports an error, run:

systemctl reset-failed ceph-mon@ceph-node1.service

Then start it again:

systemctl start ceph-mon@ceph-node1.service

The command below is what the systemd unit actually runs. When the service will not start and journalctl -xe shows no useful error, run it directly in the foreground to see what is failing:

/usr/bin/ceph-mon -f --cluster ceph --id ceph-node1 --setuser ceph --setgroup ceph

Verify
Ignore the error; no OSDs have been configured at this point.

[root@controller yum.repos.d]# ceph -s
    cluster 6b359d58-e4a0-4751-b8f0-23e90d88d794
     health HEALTH_ERR
            no osds
     monmap e1: 1 mons at {ceph-node1=192.168.140.200:6789/0}
            election epoch 3, quorum 0 ceph-node1
     osdmap e1: 0 osds: 0 up, 0 in
            flags sortbitwise,require_jewel_osds
      pgmap v2: 64 pgs, 1 pools, 0 bytes data, 0 objects
            0 kB used, 0 kB / 0 kB avail
                  64 creating
[root@controller yum.repos.d]# ceph health
HEALTH_ERR no osds

Troubleshooting
Running ceph -s at the end of the procedure hung.
Cause

The /etc/ceph/ceph.conf configuration file contained
mon initial members = ceph01
mon host = ceph-node1,ceph-node2,ceph-node3 

The name in mon initial members did not match any of the entries in mon host.
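
For this cluster the consistent values are the ones used earlier in /etc/ceph/ceph.conf, where the name in mon initial members is one of the names in mon host:

[mon]
mon initial members = ceph-node1
mon host = ceph-node1,ceph-node2,ceph-node3
mon addr = 192.168.235.123,192.168.235.137,192.168.235.142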

Create an OSD

Check the disks available on the system:

[root@ceph-node1 mon]# ceph-disk list
/dev/vda :
 /dev/vda2 other, ext4, mounted on /
 /dev/vda1 other, ext4, mounted on /boot
/dev/vdb other, unknown

Change the partition table type

Note: Ceph OSDs work on GUID Partition Table (GPT) partitions. If the OSD disk does not use GPT, change its partition label from whatever type it is to GPT.

parted /dev/vdb mklabel GPT
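
An optional check of the new label before preparing the disk (the field names below assume an English locale):

parted /dev/vdb print | grep -i "partition table"
# Expect: Partition Table: gpt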

Prepare the OSD disk

Supply the cluster and filesystem information used to prepare the disk for the OSD.
Syntax

ceph-disk prepare --cluster {cluster-name} --cluster-uuid {fsid} --fs-type {ext4|xfs|btrfs} {data-path} [{journal-path}]
ceph-disk prepare --cluster ceph --cluster-uuid 6b359d58-e4a0-4751-b8f0-23e90d88d794 --fs-type xfs /dev/vdb 
Output
Creating new GPT entries.
Setting name!
partNum is 1
REALLY setting name!
The operation has completed successfully.
Setting name!
partNum is 0
REALLY setting name!
The operation has completed successfully.
meta-data=/dev/vdb1              isize=2048   agcount=4, agsize=1245119 blks
         =                       sectsz=512   attr=2, projid32bit=1
         =                       crc=0        finobt=0
data     =                       bsize=4096   blocks=4980475, imaxpct=25
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0 ftype=0
log      =internal log           bsize=4096   blocks=2560, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0
Warning: The kernel is still using the old partition table.
The new table will be used at the next reboot.
The operation has completed successfully.

Check the disk partition table type:

[root@ceph-node1 mon]# fdisk -l

Disk /dev/vda: 21.5 GB, 21474836480 bytes, 41943040 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk label type: dos
Disk identifier: 0x00027617

   Device Boot      Start         End      Blocks   Id  System
/dev/vda1   *        2048      411647      204800   83  Linux
/dev/vda2          411648    41943039    20765696   83  Linux
WARNING: fdisk GPT support is currently new, and therefore in an experimental phase. Use at your own discretion.

Disk /dev/vdb: 21.5 GB, 21474836480 bytes, 41943040 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk label type: gpt


#         Start          End    Size  Type            Name
 1      2099200     41943006     19G  unknown         ceph data
 2         2048      2099199      1G  unknown         ceph journal

Note: in the Disk label type field, dos means an MBR partition table and gpt means a GPT partition table.
After preparing the disk you can see that the ceph tooling automatically split it into a data part and a journal part: vdb1 is the data partition and vdb2 is the journal partition.
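
The data/journal split can also be seen from the partition names that ceph-disk wrote; a quick check, assuming your util-linux provides the PARTLABEL column (CentOS 7 does):

lsblk -o NAME,SIZE,TYPE,PARTLABEL /dev/vdb
# vdb1 should carry the partition label "ceph data" and vdb2 "ceph journal"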

Activate the OSD

ceph-disk activate /dev/vdb1
Output
got monmap epoch 1
added key for osd.0
Created symlink from /run/systemd/system/ceph-osd.target.wants/ceph-osd@0.service to /usr/lib/systemd/system/ceph-osd@.service.

Check the status:

[root@ceph-node1 mon]# ceph -s
    cluster 6b359d58-e4a0-4751-b8f0-23e90d88d794
     health HEALTH_WARN
            64 pgs degraded
            64 pgs stuck degraded
            64 pgs stuck unclean
            64 pgs stuck undersized
            64 pgs undersized
     monmap e1: 1 mons at {ceph-node1=192.168.235.123:6789/0}
            election epoch 4, quorum 0 ceph-node1
     osdmap e5: 1 osds: 1 up, 1 in
            flags sortbitwise,require_jewel_osds
      pgmap v11: 64 pgs, 1 pools, 0 bytes data, 0 objects
            107 MB used, 19337 MB / 19444 MB avail
                  64 active+undersized+degraded

Note: this is simply because there is only one OSD so far; it is nothing to worry about.
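
The PGs stay undersized because the pool's replica count (3 by default in jewel) is larger than the current number of OSDs. To inspect, or temporarily lower, the replica count before the other OSDs join, something like the following works, assuming the default pool created by jewel is named rbd:

ceph osd pool get rbd size     # show the current replica count (3 by default)
# Optional, and only sensible on a throw-away test cluster: run with a single replica until more OSDs join
# ceph osd pool set rbd size 1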

Copy the configuration to ceph-node2 and ceph-node3

This lets ceph-node2 and ceph-node3 issue cluster administration commands:

scp /etc/ceph/ceph* ceph-node2:/etc/ceph/
scp /etc/ceph/ceph* ceph-node3:/etc/ceph/

Expand the cluster

Add a monitor

Log in to ceph-node2 and create the directory:

mkdir -p /var/lib/ceph/mon/ceph-ceph-node2

Edit the /etc/ceph/ceph.conf configuration file and add the new monitor's information under the [mon] section:

[mon.ceph-node2]
host = ceph-node2
mon addr = 192.168.235.137:6789
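
ceph-node3 is added in exactly the same way afterwards: repeat every step of this section on it with its own directory and configuration entry, which follows the same pattern using the address from the host table at the top of the article:

[mon.ceph-node3]
host = ceph-node3
mon addr = 192.168.235.142:6789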

Extract the monitor keyring from the Ceph cluster:

[root@ceph-node2 ~]# ceph auth get mon. -o monkeyring
Output
exported keyring for mon.

View the keyring:

[root@ceph-node2 ~]# cat monkeyring 
[mon.]
	key = AQDOYppfR0XgJRAAPwNFbVJc8VqDqtXDLP/0lQ==
	caps mon = "allow *"

Get the monitor map from the Ceph cluster:

ceph mon getmap -o monmap
Output
got monmap epoch 1

Using the key and the monmap obtained above, build the new monitor's data store (mon fs):

ceph-mon -i ceph-node2 --mkfs --monmap monmap --keyring monkeyring
Output
ceph-mon: set fsid to 6b359d58-e4a0-4751-b8f0-23e90d88d794
ceph-mon: created monfs at /var/lib/ceph/mon/ceph-ceph-node2 for mon.ceph-node2

Add the new mon to the cluster:

ceph mon add ceph-node2 192.168.235.137
Output
port defaulted to 6789; adding mon.ceph-node2 at 192.168.235.137:6789/0

Set ownership

chown -R ceph.ceph /var/lib/ceph/mon/ceph-ceph-node2/

Start the daemon

systemctl start ceph-mon@ceph-node2.service
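
Once the new monitor is running it should join the quorum; an optional quick check:

ceph mon stat
# ceph-node2 should now appear in the monmap and in the quorum list alongside ceph-node1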

Add an OSD

Check the disks available on the system:

ceph-disk list
Output
/dev/vda :
 /dev/vda2 other, ext4, mounted on /
 /dev/vda1 other, ext4, mounted on /boot
/dev/vdb other, unknown

Change the partition table type

parted /dev/vdb mklabel GPT

Prepare the OSD disk

ceph-disk prepare --cluster ceph --cluster-uuid 6b359d58-e4a0-4751-b8f0-23e90d88d794 --fs-type xfs /dev/vdb 
Output
Setting name!
partNum is 1
REALLY setting name!
The operation has completed successfully.
Setting name!
partNum is 0
REALLY setting name!
The operation has completed successfully.
meta-data=/dev/vdb1              isize=2048   agcount=4, agsize=1245119 blks
         =                       sectsz=512   attr=2, projid32bit=1
         =                       crc=0        finobt=0
data     =                       bsize=4096   blocks=4980475, imaxpct=25
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0 ftype=0
log      =internal log           bsize=4096   blocks=2560, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0
Warning: The kernel is still using the old partition table.
The new table will be used at the next reboot.
The operation has completed successfully.

Activate the OSD

ceph-disk activate /dev/vdb1
Removed symlink /run/systemd/system/ceph-osd.target.wants/ceph-osd@1.service.
Created symlink from /run/systemd/system/ceph-osd.target.wants/ceph-osd@1.service to /usr/lib/systemd/system/ceph-osd@.service.
ceph -s
    cluster 6b359d58-e4a0-4751-b8f0-23e90d88d794
     health HEALTH_WARN
            clock skew detected on mon.ceph-node2, mon.ceph-node3
            Monitor clock skew detected 
     monmap e3: 3 mons at {ceph-node1=192.168.235.123:6789/0,ceph-node2=192.168.235.137:6789/0,ceph-node3=192.168.235.142:6789/0}
            election epoch 16, quorum 0,1,2 ceph-node1,ceph-node2,ceph-node3
     osdmap e27: 3 osds: 3 up, 3 in
            flags sortbitwise,require_jewel_osds
      pgmap v78: 64 pgs, 1 pools, 0 bytes data, 0 objects
            323 MB used, 58011 MB / 58334 MB avail
                  64 active+clean

clock skew detected on mon.ceph-node2, mon.ceph-node3
This warning appears because time synchronization was not configured; install and configure a time synchronization server (for example chrony, as in the installation section) and it will clear.
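
With chrony pointing every node at the same source (see the installation section), the remaining skew can be checked directly; a minimal sketch:

chronyc tracking | grep "System time"   # the offset should be far below the mon's default 0.05 s drift allowance
ceph health detail                      # the clock skew warning clears once the monitors resync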
