Monday, April 13, 2015

Reset (unbind and bind) a HBA without rebooting Linux system

One of the paths of a SAN LUN was in the 'failed faulty running' state. Resetting the HBA (unbind, then bind) brought the path back to the active state.


>> The output below shows that one path of the LUN has failed.

# multipath -ll
mpath999 (3600a0b80000b5c9c0000044d3b667c19) dm-0 STORAGE,VV
size=200G features='1 queue_if_no_path' hwhandler='0' wp=rw
`-+- policy='round-robin 0' prio=1 status=active
  |- 4:0:0:0 sdb 8:16 failed faulty running
  `- 1:0:0:0 sdc 8:32 active ready  running



>> 'fdisk -l' does not list anything on the faulty path, but works fine on the working one.

# fdisk -l /dev/sdb
#
# fdisk -l /dev/sdc

Disk /dev/sdc: 214.7 GB, 214748364800 bytes
255 heads, 63 sectors/track, 26108 cylinders
Units = cylinders of 16065 * 512 = 8225280 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 16384 bytes / 16777216 bytes
Disk identifier: 0x00000000



>> 'dmsetup status' also shows that the path is in the F (failed) state ( 8:16 F )

# dmsetup status mpath999
0 419430400 multipath 2 0 0 0 1 1 A 0 2 0 8:16 F 1 8:32 A 0



>> Rescanning the SCSI hosts, restarting multipathd and issuing a LIP on host4 - none of these brought the faulty path online.

# for scsi_bus in $(ls /sys/class/scsi_host/); do echo '- - -' > /sys/class/scsi_host/$scsi_bus/scan; done
# /etc/init.d/multipathd restart
# echo '1' >/sys/class/fc_host/host4/issue_lip



>> On this system, 2 FC hosts are online, hence each SAN disk should have 2 paths. Also take note of host4 device ID - 0000:21:00.1

# cd /sys/class/fc_host/
# ls -l
total 0
lrwxrwxrwx 1 root root 0 Jun 21  2014 host1 -> ../../devices/pci0000:00/0000:00:03.0/0000:05:00.0/host1/fc_host/host1
lrwxrwxrwx 1 root root 0 Jun 21  2014 host2 -> ../../devices/pci0000:00/0000:00:03.0/0000:05:00.1/host2/fc_host/host2
lrwxrwxrwx 1 root root 0 Jun 21  2014 host3 -> ../../devices/pci0000:20/0000:20:03.0/0000:21:00.0/host3/fc_host/host3
lrwxrwxrwx 1 root root 0 Jun 21  2014 host4 -> ../../devices/pci0000:20/0000:20:03.0/0000:21:00.1/host4/fc_host/host4
# cat host1/port_state
Online
# cat host?/port_state
Online
Linkdown
Linkdown
Online



>> Check which devices are connected to the problematic HBA host

# ls -l /dev/disk/by-path|grep 0000:21:00.1
lrwxrwxrwx 1 root root  9 Apr 13 10:11 pci-0000:21:00.1-fc-0x22220002ac004714-lun-0 -> ../../sdb



>>Force all the paths attached to that HBA to fail, and remove all the device entries from the system

# faulty_hba=0000:21:00.1
# for d in $(ls -l /dev/disk/by-path/| grep $faulty_hba |awk -F'/' '{print $NF}'); do
    echo "Dealing with $d" 
    echo "fail path $d" | multipathd -k 
    echo "del path $d" | multipathd -k 
    echo 1 > /sys/block/$d/device/delete 
done 



>> failed path should disappear from multipath

# multipath -ll
mpath999 (3600a0b80000b5c9c0000044d3b667c19) dm-0 STORAGE,VV
size=200G features='1 queue_if_no_path' hwhandler='0' wp=rw
`-+- policy='round-robin 0' prio=1 status=active
  `- 1:0:0:0 sdc 8:32 active ready  running



>> Unbind (take offline) the problematic HBA - note that the host4 device ID is 0000:21:00.1; it will disappear from the driver's device list.

# cd /sys/bus/pci/drivers/qla2xxx/
# ll
total 0
lrwxrwxrwx 1 root root    0 Apr 13 10:09 0000:05:00.0 -> ../../../../devices/pci0000:00/0000:00:03.0/0000:05:00.0
lrwxrwxrwx 1 root root    0 Apr 13 10:09 0000:05:00.1 -> ../../../../devices/pci0000:00/0000:00:03.0/0000:05:00.1
lrwxrwxrwx 1 root root    0 Apr 13 10:09 0000:21:00.0 -> ../../../../devices/pci0000:20/0000:20:03.0/0000:21:00.0
lrwxrwxrwx 1 root root    0 Apr 13 10:09 0000:21:00.1 -> ../../../../devices/pci0000:20/0000:20:03.0/0000:21:00.1
--w------- 1 root root 4096 Apr 13 10:09 bind
lrwxrwxrwx 1 root root    0 Apr 13 10:09 module -> ../../../../module/qla2xxx
--w------- 1 root root 4096 Apr 13 10:09 new_id
--w------- 1 root root 4096 Apr 13 10:09 remove_id
--w------- 1 root root 4096 Jun 21  2014 uevent
--w------- 1 root root 4096 Apr 13 10:09 unbind
# echo '0000:21:00.1' > unbind
# ll
total 0
lrwxrwxrwx 1 root root    0 Apr 13 10:09 0000:05:00.0 -> ../../../../devices/pci0000:00/0000:00:03.0/0000:05:00.0
lrwxrwxrwx 1 root root    0 Apr 13 10:09 0000:05:00.1 -> ../../../../devices/pci0000:00/0000:00:03.0/0000:05:00.1
lrwxrwxrwx 1 root root    0 Apr 13 10:09 0000:21:00.0 -> ../../../../devices/pci0000:20/0000:20:03.0/0000:21:00.0
--w------- 1 root root 4096 Apr 13 10:09 bind
lrwxrwxrwx 1 root root    0 Apr 13 10:09 module -> ../../../../module/qla2xxx
--w------- 1 root root 4096 Apr 13 10:09 new_id
--w------- 1 root root 4096 Apr 13 10:09 remove_id
--w------- 1 root root 4096 Jun 21  2014 uevent
--w------- 1 root root 4096 Apr 13 10:11 unbind


>> bind (on line) host4 HBA again - corresponding entry should be available

# echo '0000:21:00.1' > bind
# ll
total 0
lrwxrwxrwx 1 root root    0 Apr 13 10:09 0000:05:00.0 -> ../../../../devices/pci0000:00/0000:00:03.0/0000:05:00.0
lrwxrwxrwx 1 root root    0 Apr 13 10:09 0000:05:00.1 -> ../../../../devices/pci0000:00/0000:00:03.0/0000:05:00.1
lrwxrwxrwx 1 root root    0 Apr 13 10:09 0000:21:00.0 -> ../../../../devices/pci0000:20/0000:20:03.0/0000:21:00.0
lrwxrwxrwx 1 root root    0 Apr 13 10:11 0000:21:00.1 -> ../../../../devices/pci0000:20/0000:20:03.0/0000:21:00.1
--w------- 1 root root 4096 Apr 13 10:11 bind
lrwxrwxrwx 1 root root    0 Apr 13 10:09 module -> ../../../../module/qla2xxx
--w------- 1 root root 4096 Apr 13 10:09 new_id
--w------- 1 root root 4096 Apr 13 10:09 remove_id
--w------- 1 root root 4096 Jun 21  2014 uevent
--w------- 1 root root 4096 Apr 13 10:11 unbind



>> The HBA port may come up with a different name - here host7 (instead of host4)

# ll /sys/class/fc_host/
total 0
lrwxrwxrwx 1 root root 0 Jun 21  2014 host1 -> ../../devices/pci0000:00/0000:00:03.0/0000:05:00.0/host1/fc_host/host1
lrwxrwxrwx 1 root root 0 Jun 21  2014 host2 -> ../../devices/pci0000:00/0000:00:03.0/0000:05:00.1/host2/fc_host/host2
lrwxrwxrwx 1 root root 0 Jun 21  2014 host3 -> ../../devices/pci0000:20/0000:20:03.0/0000:21:00.0/host3/fc_host/host3
lrwxrwxrwx 1 root root 0 Apr 13 10:12 host7 -> ../../devices/pci0000:20/0000:20:03.0/0000:21:00.1/host7/fc_host/host7



>> Finally, all previously faulty paths of the LUN should be active and ready (assuming there is no other problem with FC cables, the SAN network, storage host ports, etc.)!
# multipath -ll
mpath999 (3600a0b80000b5c9c0000044d3b667c19) dm-0 STORAGE,VV
size=200G features='1 queue_if_no_path' hwhandler='0' wp=rw
`-+- policy='round-robin 0' prio=1 status=active
  |- 1:0:0:0 sdc 8:32 active ready running
  `- 7:0:0:0 sdb 8:16 active ready running



Reference: https://access.redhat.com/solutions/287303