http://www.oracleblog.org/working-case/ohasd-can-not-auto-start-with-server-reboot/
在Oracle Linux Server release 6.2下安裝11.2.0.1的grid,安裝的時候,順利完成(可能最後執行root.sh的時候有報錯,但是沒注意到),等安裝完重啟主機,發現crs起不來。因此,後續的安裝RAC也無法正常進行下去。
重啟主機,一開始還能看到crs和ohas的程式,但是一會crs的程式就消失了,ohasd的程式也只有一個。
一開始看到有crs程式:
[root@ol6-112-rac1 ~]# ps -ef |grep crs
root 1983 1 0 10:39 ? 00:00:00 /u01/app/11.2.0.3/grid/bin/crsctl.bin start has
root 2185 2153 1 10:40 pts/0 00:00:00 grep crs
[root@ol6-112-rac1 ~]#
如果strace crs的程式可以看到:
……
nanosleep({5, 0}, 0xbfb2a9b0) = 0
open("/u01/app/11.2.0.3/grid/crs/mesg/crsus.msb", O_RDONLY) = 3
fcntl64(3, F_SETFD, FD_CLOEXEC) = 0
lseek(3, 0, SEEK_SET) = 0
read(3, "\25\23\"\1\23\3\t\t\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 256) = 256
lseek(3, 512, SEEK_SET) = 512
read(3, "\21'\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 1024, SEEK_SET) = 1024
read(3, "\226\0\245\0\262\0\307\0\330\0\345\0\364\0\376\0\7\1\21\0013\1;\1D\1\231\1\303\1\370\1"..., 512) = 512
lseek(3, 75264, SEEK_SET) = 75264
read(3, "\3\0\32\20\3\0\32\0\33\20\0\0\30\1\34\20\0\0K\1\0\0\0\0|\1 crsc"..., 512) = 512
lseek(3, 115712, SEEK_SET) = 115712
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116224, SEEK_SET) = 116224
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116736, SEEK_SET) = 116736
read(3, "\0\0\0\0\0\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
close(3) = 0
fstat64(1, {st_mode=S_IFCHR|0600, st_rdev=makedev(5, 1), ...}) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B38400 -opost -isig -icanon -echo ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb55d6000
write(1, "CRS-4124: Oracle High Availabili"..., 60) = 60
open("/u01/app/11.2.0.3/grid/crs/mesg/crsus.msb", O_RDONLY) = 3
fcntl64(3, F_SETFD, FD_CLOEXEC) = 0
lseek(3, 0, SEEK_SET) = 0
read(3, "\25\23\"\1\23\3\t\t\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 256) = 256
lseek(3, 512, SEEK_SET) = 512
read(3, "\21'\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 1024, SEEK_SET) = 1024
read(3, "\226\0\245\0\262\0\307\0\330\0\345\0\364\0\376\0\7\1\21\0013\1;\1D\1\231\1\303\1\370\1"..., 512) = 512
lseek(3, 52224, SEEK_SET) = 52224
read(3, "\10\0\353\n\1\0008\0\354\n\1\0\212\0\355\n\3\0\320\0\356\n\0\0\24\1\240\17\0\0q\1"..., 512) = 512
lseek(3, 115712, SEEK_SET) = 115712
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116224, SEEK_SET) = 116224
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116736, SEEK_SET) = 116736
read(3, "\0\0\0\0\0\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
close(3) = 0
write(1, "CRS-4000: Command Start failed, "..., 58) = 58
munmap(0xb55d7000, 35744) = 0
futex(0xa05d2f4, FUTEX_CMP_REQUEUE_PRIVATE, 1, 2147483647, 0xa05d1d0, 4) = 1
futex(0xa05d1d0, FUTEX_WAKE_PRIVATE, 1) = 1
munmap(0xb4d51000, 4198400) = 0
munmap(0xb5152000, 143360) = 0
munmap(0xb5175000, 143360) = 0
munmap(0xb5198000, 143360) = 0
exit_group(1) = ?
Process 1983 detached
[root@ol6-112-rac2 ~]#
後續,程式就消失了。
[root@ol6-112-rac1 ~]# ps -ef |grep crs
root 2248 2153 1 10:42 pts/0 00:00:00 grep crs
[root@ol6-112-rac1 ~]#
如果strace ohasd程式,可以看到:
[oracle@ol6-112-rac1 cfgtoollogs]$ ps -ef |grep ohasd
root 2111 1 0 09:14 ? 00:00:00 /u01/app/11.2.0.3/grid/bin/ohasd.bin reboot
root 2369 2155 0 09:31 pts/0 00:00:00 ./ohasd.bin start
oracle 2400 2313 0 09:34 pts/1 00:00:00 grep ohasd
[oracle@ol6-112-rac1 cfgtoollogs]$ strace -p 2369
attach: ptrace(PTRACE_ATTACH, ...): Operation not permitted
[oracle@ol6-112-rac1 cfgtoollogs]$ exit
logout
[root@ol6-112-rac1 ~]#
[root@ol6-112-rac1 ~]#
[root@ol6-112-rac1 ~]# strace -p 2369
Process 2369 attached - interrupt to quit
open("/var/tmp/.oracle/npohasd", O_WRONLY) = ? ERESTARTSYS (To be restarted)
我們知道,11g GI的啟動順序是(參考 ID 1050908.1):
ohasd --> ohasd agents --> daemons (gipcd, mdnsd, gpnpd, ctssd, ocssd, crsd, evmd, asm etc),然後 crsd --> crsd agents --> user resources (database, SCAN, listener etc)
我們現在看到情況是連ohasd都沒有啟動,沒看到/etc/init.d/init.ohasd run這樣的程式。
正常情況下,應該有:
ps -ef|grep init.ohasd|grep -v grep
root 2279 1 0 18:14 ? 00:00:00 /bin/sh /etc/init.d/init.ohasd run
ok,我們嘗試手工啟動/etc/init.d/init.ohasd run:
[root@ol6-112-rac1 bin]# /etc/init.d/init.ohasd run
mkfifo: cannot create fifo `/var/tmp/.oracle/npohasd': File exists
^C
[root@ol6-112-rac1 bin]#
看到是/var/tmp/.oracle/npohasd已經被佔據,我們移除一下:
[root@ol6-112-rac1 .oracle]# mv npohasd npohasd.bak
[root@ol6-112-rac1 .oracle]#
然後就能順利啟動了。——看來,如果手動啟動ohasd,還是能起來的,但是為什麼不能自動的啟動?
檢查inittab也確實是存在的:
cat /etc/inittab|grep init.ohasd
h1:35:respawn:/etc/init.d/init.ohasd run >/dev/null 2>&1 </dev/null
根據[ID 1050908.1],裡面有一句話:
Note: Oracle Linux 6 (OL6) or Red Hat Linux 6 (RHEL6) has deprecated inittab, rather, init.ohasd will be configured
in upstart in /etc/init, however, the process "/etc/init.d/init.ohasd run" should still be up.
If any rc Snncommand script (located in rcn.d, example S98gcstartup) stuck, init process may not start
"/etc/init.d/init.ohasd run"; please engage OS vendor to find out why relevant Snncommand script stuck.
查了一下,linux 6和11.2.0.1果然沒經過certified,經過認證的是11.2.0.3.
在linux 6下讓11.2.0.1的ohasd隨主機開機自動啟動,需要特別處理一下(如果還在安裝過程中,直接在執行root.sh前,執行從第2步開始的步驟):
1. 先將root.sh執行的內容回滾:
[root@ol6-112-rac1 install]# ./roothas.pl -deconfig -force -verbose
2013-06-06 14:53:07: Checking for super user privileges
2013-06-06 14:53:07: User has super user privileges
2013-06-06 14:53:07: Parsing the host name
Using configuration parameter file: ./crsconfig_params
CRS resources for listeners are still configured
CRS-2613: Could not find resource 'ora.cssd'.
CRS-4000: Command Stop failed, or completed with errors.
CRS-2613: Could not find resource 'ora.cssd'.
CRS-4000: Command Delete failed, or completed with errors.
CRS-2791: Starting shutdown of Oracle High Availability Services-managed resources on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.crsd' on 'ol6-112-rac1'
CRS-2790: Starting shutdown of Cluster Ready Services-managed resources on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.LISTENER_SCAN3.lsnr' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.DATA.dg' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.LISTENER_SCAN2.lsnr' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.ol6-112-rac2.vip' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.LISTENER_SCAN1.lsnr' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.ol6-112-rac1.vip' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.ol6-112-rac2.vip' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.ol6-112-rac1.vip' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.LISTENER_SCAN3.lsnr' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.scan3.vip' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.scan3.vip' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.LISTENER_SCAN2.lsnr' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.scan2.vip' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.scan2.vip' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.LISTENER_SCAN1.lsnr' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.scan1.vip' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.scan1.vip' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.DATA.dg' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.asm' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.asm' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.eons' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.ons' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.ons' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.net1.network' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.net1.network' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.eons' on 'ol6-112-rac1' succeeded
CRS-2792: Shutdown of Cluster Ready Services-managed resources on 'ol6-112-rac1' has completed
CRS-2677: Stop of 'ora.crsd' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.gpnpd' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.cssdmonitor' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.ctssd' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.evmd' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.asm' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.mdnsd' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.cssdmonitor' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.gpnpd' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.evmd' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.mdnsd' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.ctssd' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.asm' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.cssd' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.cssd' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.diskmon' on 'ol6-112-rac1'
CRS-2673: Attempting to stop 'ora.gipcd' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.gipcd' on 'ol6-112-rac1' succeeded
CRS-2677: Stop of 'ora.diskmon' on 'ol6-112-rac1' succeeded
CRS-2793: Shutdown of Oracle High Availability Services-managed resources on 'ol6-112-rac1' has completed
CRS-4133: Oracle High Availability Services has been stopped.
ADVM/ACFS is not supported on oraclelinux-release-6Server-2.0.2.i686
ACFS-9201: Not Supported
Successfully deconfigured Oracle Restart stack
[root@ol6-112-rac1 install]#
2. 修改$GI_HOME/crs/install/s_crsconfig_lib.pm,在裡面的# Start OHASD前加上如下的程式碼:
vi /u01/app/11.2.0.3/grid/crs/install/s_crsconfig_lib.pm
……
sleep (5);
s_add_itab () or return $FAILED;
system ("$INIT q");
## == add by oracleblog.org .Start here ==
my $UPSTART_OHASD_SERVICE = "oracle-ohasd";
my $INITCTL = "/sbin/initctl";
($status, @output) = system_cmd_capture ("$INITCTL start $UPSTART_OHASD_SERVICE");
if (0 != $status)
{
error ("Failed to start $UPSTART_OHASD_SERVICE, error: $!");
return $FAILED;
}
## == add by oracleblog.org .End here ==
# Start OHASD
$status = system ("$CRSCTL start has");
……
3. 建立/etc/init/oracle-ohasd.conf檔案,檔案內容如下:
# Oracle OHASD startup
start on runlevel [35]
stop on runlevel [!35]
respawn
exec /etc/init.d/init.ohasd run >/dev/null 2>&1 </dev/null
4. 再次執行root.sh
[root@ol6-112-rac1 grid]# sh root.sh
Running Oracle 11g root.sh script...
The following environment variables are set as:
ORACLE_OWNER= oracle
ORACLE_HOME= /u01/app/11.2.0.3/grid
Enter the full pathname of the local bin directory: [/usr/local/bin]:
The file "dbhome" already exists in /usr/local/bin. Overwrite it? (y/n)
[n]: y
Copying dbhome to /usr/local/bin ...
The file "oraenv" already exists in /usr/local/bin. Overwrite it? (y/n)
[n]: y
Copying oraenv to /usr/local/bin ...
The file "coraenv" already exists in /usr/local/bin. Overwrite it? (y/n)
[n]: y
Copying coraenv to /usr/local/bin ...
Entries will be added to the /etc/oratab file as needed by
Database Configuration Assistant when a database is created
Finished running generic part of root.sh script.
Now product-specific root actions will be performed.
2013-06-06 15:04:23: Parsing the host name
2013-06-06 15:04:23: Checking for super user privileges
2013-06-06 15:04:23: User has super user privileges
Using configuration parameter file: /u01/app/11.2.0.3/grid/crs/install/crsconfig_params
LOCAL ADD MODE
Creating OCR keys for user 'root', privgrp 'root'..
Operation successful.
Adding daemon to inittab
CRS-4123: Oracle High Availability Services has been started.
ohasd is starting
ADVM/ACFS is not supported on oraclelinux-release-6Server-2.0.2.i686
CRS-2672: Attempting to start 'ora.gipcd' on 'ol6-112-rac1'
CRS-2672: Attempting to start 'ora.mdnsd' on 'ol6-112-rac1'
CRS-2676: Start of 'ora.gipcd' on 'ol6-112-rac1' succeeded
CRS-2676: Start of 'ora.mdnsd' on 'ol6-112-rac1' succeeded
CRS-2672: Attempting to start 'ora.gpnpd' on 'ol6-112-rac1'
CRS-2676: Start of 'ora.gpnpd' on 'ol6-112-rac1' succeeded
CRS-2672: Attempting to start 'ora.cssdmonitor' on 'ol6-112-rac1'
CRS-2676: Start of 'ora.cssdmonitor' on 'ol6-112-rac1' succeeded
CRS-2672: Attempting to start 'ora.cssd' on 'ol6-112-rac1'
CRS-2672: Attempting to start 'ora.diskmon' on 'ol6-112-rac1'
CRS-2676: Start of 'ora.diskmon' on 'ol6-112-rac1' succeeded
CRS-2676: Start of 'ora.cssd' on 'ol6-112-rac1' succeeded
CRS-2500: Cannot stop resource 'ora.crsd' as it is not running
CRS-4000: Command Stop failed, or completed with errors.
Command return code of 1 (256) from command: /u01/app/11.2.0.3/grid/bin/crsctl stop resource ora.crsd -init
Stop of resource "ora.crsd -init" failed
Failed to stop CRSD
CRS-2500: Cannot stop resource 'ora.asm' as it is not running
CRS-4000: Command Stop failed, or completed with errors.
Command return code of 1 (256) from command: /u01/app/11.2.0.3/grid/bin/crsctl stop resource ora.asm -init
Stop of resource "ora.asm -init" failed
Failed to stop ASM
CRS-2500: Cannot stop resource 'ora.ctssd' as it is not running
CRS-4000: Command Stop failed, or completed with errors.
Command return code of 1 (256) from command: /u01/app/11.2.0.3/grid/bin/crsctl stop resource ora.ctssd -init
Stop of resource "ora.ctssd -init" failed
Failed to stop OCTSSD
CRS-2673: Attempting to stop 'ora.cssdmonitor' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.cssdmonitor' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.cssd' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.cssd' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.gpnpd' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.gpnpd' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.gipcd' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.gipcd' on 'ol6-112-rac1' succeeded
CRS-2673: Attempting to stop 'ora.mdnsd' on 'ol6-112-rac1'
CRS-2677: Stop of 'ora.mdnsd' on 'ol6-112-rac1' succeeded
Initial cluster configuration failed. See /u01/app/11.2.0.3/grid/cfgtoollogs/crsconfig/rootcrs_ol6-112-rac1.log for details
[root@ol6-112-rac1 grid]#
重啟後,你發現ohasd已經可以隨主機開機自動啟動了。