赞
踩
实验环境raspi 4b
See also drivers/md/md-faulty.c and “every_nth” module option for scsi_debug.
Available fault injection capabilities
failslab
injects slab allocation failures. (kmalloc(), kmem_cache_alloc(), …)
fail_page_alloc
injects page allocation failures. (alloc_pages(), get_free_pages(), …)
fail_usercopy
injects failures in user memory access functions. (copy_from_user(), get_user(), …)
fail_futex
injects futex deadlock and uaddr fault errors.
fail_sunrpc
injects kernel RPC client and server failures.
fail_make_request
injects disk IO errors on devices permitted by setting
/sys/block/<device>/make-it-fail or
/sys/block/<device>/<partition>/make-it-fail. (submit_bio_noacct())
fail_mmc_request
injects MMC data errors on devices permitted by setting
debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request
fail_function
injects error return on specific functions, which are marked by
ALLOW_ERROR_INJECTION() macro, by setting debugfs entries
under /sys/kernel/debug/fail_function. No boot option supported.
NVMe fault injection
inject NVMe status code and retry flag on devices permitted by setting
debugfs entries under /sys/kernel/debug/nvme*/fault_inject. The default
status code is NVME_SC_INVALID_OPCODE with no retry. The status code and
retry flag can be set via the debugfs.
看看系统中有没有这个文件,从结果来看,并没有,有什么特殊的编译选项需要开启?
root@runninglinuxkernel:~# ls -alh /sys/block/vda
lrwxrwxrwx 1 root root 0 Mar 13 23:59 /sys/block/vda -> ../devices/pci0000:00/0000:00:03.0/virtio0/block/vda
root@runninglinuxkernel:~# ls -alh /sys/block/vda/
total 0
drwxr-xr-x 8 root root 0 Mar 13 23:59 .
drwxr-xr-x 3 root root 0 Mar 13 23:59 ..
-r--r--r-- 1 root root 4.0K Mar 14 00:02 alignment_offset
lrwxrwxrwx 1 root root 0 Mar 14 00:02 bdi -> ../../../../../virtual/bdi/254:0
-rw-r--r-- 1 root root 4.0K Mar 14 00:02 cache_type
-r--r--r-- 1 root root 4.0K Mar 14 00:02 capability
-r--r--r-- 1 root root 4.0K Mar 14 00:02 dev
lrwxrwxrwx 1 root root 0 Mar 14 00:02 device -> ../../../virtio0
-r--r--r-- 1 root root 4.0K Mar 14 00:02 discard_alignment
-r--r--r-- 1 root root 4.0K Mar 14 00:02 ext_range
-r--r--r-- 1 root root 4.0K Mar 14 00:02 hidden
drwxr-xr-x 2 root root 0 Mar 14 00:02 holders
-r--r--r-- 1 root root 4.0K Mar 14 00:02 inflight
drwxr-xr-x 3 root root 0 Mar 13 23:59 mq
drwxr-xr-x 2 root root 0 Mar 14 00:02 power
drwxr-xr-x 3 root root 0 Mar 14 00:02 queue
-r--r--r-- 1 root root 4.0K Mar 14 00:02 range
-r--r--r-- 1 root root 4.0K Mar 14 00:02 removable
-r--r--r-- 1 root root 4.0K Mar 14 00:02 ro
-r--r--r-- 1 root root 4.0K Mar 13 23:59 serial
-r--r--r-- 1 root root 4.0K Mar 14 00:02 size
drwxr-xr-x 2 root root 0 Mar 13 23:59 slaves
-r--r--r-- 1 root root 4.0K Mar 14 00:02 stat
lrwxrwxrwx 1 root root 0 Mar 13 23:59 subsystem -> ../../../../../../class/block
drwxr-xr-x 2 root root 0 Mar 14 00:02 trace
-rw-r--r-- 1 root root 4.0K Mar 13 23:59 uevent
那就查查配置文件中FAULT_INJECT相关的字段。
rlk@rlk:runninglinuxkernel_5.0$ grep "FAULT_INJECT" * -r
arch/riscv/configs/busybox_defconfig:# CONFIG_FAULT_INJECTION is not set
arch/riscv/configs/debian_defconfig:# CONFIG_FAULT_INJECTION is not set
arch/x86/configs/debian_defconfig:# CONFIG_FAULT_INJECTION is not set
arch/arm64/configs/busybox_defconfig:# CONFIG_F2FS_FAULT_INJECTION is not set
arch/arm64/configs/busybox_defconfig:# CONFIG_FAULT_INJECTION is not set
arch/arm64/configs/debian_default_defconfig:# CONFIG_DRBD_FAULT_INJECTION is not set
arch/arm64/configs/debian_default_defconfig:# CONFIG_F2FS_FAULT_INJECTION is not set
arch/arm64/configs/debian_default_defconfig:# CONFIG_NFSD_FAULT_INJECTION is not set
arch/arm64/configs/debian_default_defconfig:# CONFIG_FAULT_INJECTION is not set
arch/arm64/configs/debian_defconfig:# CONFIG_F2FS_FAULT_INJECTION is not set
arch/arm64/configs/debian_defconfig:# CONFIG_FAULT_INJECTION is not set
CONFIG_FUNCTION_ERROR_INJECTION=y
CONFIG_FAULT_INJECTION=y
CONFIG_FAILSLAB=y
CONFIG_FAIL_PAGE_ALLOC=y
CONFIG_FAIL_MAKE_REQUEST=y
CONFIG_FAIL_IO_TIMEOUT=y
CONFIG_FAIL_FUTEX=y
CONFIG_FAULT_INJECTION_DEBUG_FS=y
有可能在 .config 文件中找不到对应的配置选项,需要在 menuconfig 中输入 / 进行查找,然后将对应的选项打开。
make KERNEL=kernel8 ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- Image modules dtbs -j24
curtis@raspberrypi:~ $ ll /sys/class/block/sda/
total 0
-r--r--r-- 1 root root 4096 Jun 29 15:16 alignment_offset
lrwxrwxrwx 1 root root 0 Jun 29 15:16 bdi -> ../../../../../../../../../../../../../../virtual/bdi/8:0
-r--r--r-- 1 root root 4096 Jun 29 15:16 capability
-r--r--r-- 1 root root 4096 Jun 29 14:17 dev
lrwxrwxrwx 1 root root 0 Jun 29 15:16 device -> ../../../0:0:0:0
-r--r--r-- 1 root root 4096 Jun 29 15:16 discard_alignment
-r--r--r-- 1 root root 4096 Jun 29 15:16 diskseq
-r--r--r-- 1 root root 4096 Jun 29 15:16 events
-r--r--r-- 1 root root 4096 Jun 29 15:16 events_async
-rw-r--r-- 1 root root 4096 Jun 29 15:16 events_poll_msecs
-r--r--r-- 1 root root 4096 Jun 29 15:16 ext_range
-r--r--r-- 1 root root 4096 Jun 29 15:16 hidden
drwxr-xr-x 2 root root 0 Jun 29 15:16 holders
-r--r--r-- 1 root root 4096 Jun 29 15:16 inflight
drwxr-xr-x 2 root root 0 Jun 29 15:16 integrity
-rw-r--r-- 1 root root 4096 Jun 29 15:16 io-timeout-fail
-rw-r--r-- 1 root root 4096 Jun 29 15:16 make-it-fail
drwxr-xr-x 3 root root 0 Jun 29 15:16 mq
drwxr-xr-x 2 root root 0 Jun 29 15:16 power
drwxr-xr-x 3 root root 0 Jun 29 14:17 queue
-r--r--r-- 1 root root 4096 Jun 29 15:16 range
-r--r--r-- 1 root root 4096 Jun 29 14:17 removable
-r--r--r-- 1 root root 4096 Jun 29 14:24 ro
drwxr-xr-x 5 root root 0 Jun 29 14:17 sda1
drwxr-xr-x 5 root root 0 Dec 22 2022 sda2
-r--r--r-- 1 root root 4096 Jun 29 14:24 size
drwxr-xr-x 2 root root 0 Jun 29 14:17 slaves
-r--r--r-- 1 root root 4096 Jun 29 14:24 stat
lrwxrwxrwx 1 root root 0 Jun 29 14:17 subsystem -> ../../../../../../../../../../../../../../../class/block
drwxr-xr-x 2 root root 0 Jun 29 15:16 trace
-rw-r--r-- 1 root root 4096 Dec 22 2022 uevent
已经具备故障注入的能力,应该怎么使用??
root@raspberrypi:/home/curtis# ll /sys/kernel/debug/fail
fail_function/ fail_futex/ fail_io_timeout/ fail_make_request/ fail_page_alloc/ failslab/ fail_usercopy/
root@raspberrypi:/home/curtis# ll /sys/kernel/debug/fail_make_request/
interval reject-start space times verbose_ratelimit_interval_ms
probability require-end stacktrace-depth verbose
reject-end require-start task-filter verbose_ratelimit_burst
以上这些是具体的故障的控制节点。
fault-inject-debugfs内核模块提供一些debugfs的入口以便在运行时配置fault-injection能力。
/sys/kernel/debug/fail*/probability:
故障注入的可能性,以百分比为单位。
单位:percent(百分比)
需要特别注意的是,对于某些测试用例而言,百分之一的失败率已经是很高的错误率;这类用例可以考虑设置probability=100,
再通过/sys/kernel/debug/fail*/interval来控制故障注入的间隔。
/sys/kernel/debug/fail*/interval:
指定两次故障之间的间隔,以通过了其他所有过滤条件的should_fail()调用次数计(不是时间):interval=N表示每N次这样的调用才注入一次故障。
注意:如果要使用该选项(即设定interval>1),一般需要同时设置probability=100。
/sys/kernel/debug/fail*/times:
指定故障产生的最大次数,如果设置为-1,意味着没有上限,将会一直出错。
/sys/kernel/debug/fail*/space:
设置异常的size余量,每次执行到故障注入点后,都会将在该space的基础上递减size值,直到该值降低为0后才会注入异常。其中size的含义对各种异常各不相同,对于IO异常表示的是本次IO的字节数,对于内存分配表示的是内存的大小。默认值为0。
/sys/kernel/debug/fail*/verbose
Format: { 0 | 1 | 2 }
指定故障注入时相关信息的详细程度,0意味着没有任何信息,1每个failure只会打印一行调试信息,2将会打印函数调用trace - 在对故障注入能力做调试使用。(从实际的使用结果来看,默认值为2)
/sys/kernel/debug/fail*/task-filter:
Format: { ‘Y’ | ‘N’ }
设置进程过滤。N表示不按进程过滤(默认值);Y表示只对设置了/proc/<pid>/make-it-fail=1的进程注入故障,其他进程以及中断上下文中的流程都不会触发故障注入。
/sys/kernel/debug/fail*/require-start,
/sys/kernel/debug/fail*/require-end,
/sys/kernel/debug/fail*/reject-start,
/sys/kernel/debug/fail*/reject-end:
设置调用栈的虚拟地址范围过滤。只有当回溯到的调用栈中有调用者的代码(Text段)地址落在[require-start, require-end)内,且没有任何调用者落在[reject-start, reject-end)内时才注入异常,可以用来让故障注入只针对某个或某些模块执行。默认required范围为[0, ULONG_MAX)(即整个虚拟地址空间),rejected范围为[0, 0)。
/sys/kernel/debug/fail*/stacktrace-depth:
设置[require-start, require-end) 和[reject-start, reject-end)跟踪的调用深度。默认值为32。
/sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem:
格式:{ ‘Y’ | ‘N’ }
设置页分配的高端内存过滤。默认值为Y,此时分配标志包含__GFP_HIGHMEM(高端内存)的分配不会被注入故障;设置为N后,这类分配也会被注入故障。
/sys/kernel/debug/failslab/ignore-gfp-wait:
/sys/kernel/debug/fail_page_alloc/ignore-gfp-wait:
格式:{ ‘Y’ | ‘N’ }
设置内存分配的分配模式过滤。默认值为Y,此时只对不能睡眠的内存分配(如GFP_ATOMIC)注入故障;设置为N后,可以睡眠的分配(带__GFP_DIRECT_RECLAIM标志)也会被注入故障。
/sys/kernel/debug/fail_page_alloc/min-order:
设置页分配order的过滤下限,当内核申请的页order小于该设定值时不进行故障注入。默认值为1。
/sys/kernel/debug/fail_futex/ignore-private:
格式:{ ‘Y’ | ‘N’ }
默认为“N”,将其设置为“Y”后,在处理私有(进程地址空间内的)futex 时不注入故障。
/sys/kernel/debug/fail_sunrpc/ignore-client-disconnect:
格式:{ ‘Y’ | ‘N’ }
默认为“N”,将其设置为“Y”后,不在 RPC 客户端上注入断开连接故障。
/sys/kernel/debug/fail_sunrpc/ignore-server-disconnect:
格式:{ ‘Y’ | ‘N’ }
默认为“N”,将其设置为“Y”后,不在 RPC 服务器上注入断开连接故障。
/sys/kernel/debug/fail_sunrpc/ignore-cache-wait:
格式:{ ‘Y’ | ‘N’ }
默认为“N”,将其设置为“Y”后,不在 RPC 服务器上注入缓存等待故障。
/sys/kernel/debug/fail_function/inject:
格式:{ ‘函数名’ | ‘!函数名’ | ‘’ }
通过名称指定错误注入的目标函数。如果函数名带有 ‘!’ 前缀,则把该函数从注入列表中删除;如果什么都不指定(‘’),则清空注入列表。
/sys/kernel/debug/fail_function/injectable:
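(只读)显示可以注入错误的函数,以及各函数可以指定哪种类型的错误值。错误类型是以下之一:
NULL:retval 必须为 0。
ERRNO:retval 必须为 -1 到 -MAX_ERRNO(-4096)。
ERR_NULL:retval 必须为 0 或 -1 到 -MAX_ERRNO(-4096)。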
/sys/kernel/debug/fail_function/<函数名>/retval:
指定要注入给定函数的“错误”返回值,这将在用户指定新的注入条目时创建。
请注意,此文件仅接受无符号值。所以,如果你想使用负 errno,你最好使用 ‘printf’ 而不是 ‘echo’,例如:
$ printf %#x -12 > retval
前文中提到的debugfs接口只有在debugfs启用后才有效;对于内核启动阶段或没有打开debugfs配置选项的情况,Fault-injection的配置值可以通过启动参数进行传递,包括以下:
failslab=
fail_page_alloc=
fail_usercopy=
fail_make_request=
fail_futex=
mmc_core.fail_request=<interval>,<probability>,<space>,<times>
通过启动参数传入的参数有限,目前只能接受interval、probability、space和times这4个参数(其他参数会被内核设置为默认的值),但是在一般情况下也够用了。
例如:如果想在内核启动阶段就启用failslab 100%无限故障注入,则可以传入内核启动参数:
failslab=1,100,0,-1
proc 条目
^^^^^^^^^^^^
/proc/<pid>/fail-nth,
/proc/self/task/<tid>/fail-nth:
向该文件写入整数 N,会使该任务中第 N 次(可注入故障的)调用失败。
读取此文件返回一个整数值。“0”值表示
先前写入此文件的故障设置已经被注入。
正整数 N 表示故障尚未注入。
请注意,此文件会启用所有类型的故障(slab、futex 等)。
此设置优先于所有其他通用 debugfs 设置,
例如 probability、interval、times 等。但是各故障能力自己的设置
(例如 fail_futex/ignore-private)优先于它。
此功能用于对单个系统调用内的故障进行
系统化测试。请参见下面的示例。
- 定义故障属性
DECLARE_FAULT_ATTR(name);
详情请看 fault-inject.h 中 struct fault_attr 的定义。
- 提供一种配置故障属性的方法
- 启动参数
如果需要从启动时就启用故障注入功能,可以
提供启动参数来配置它。有一个对应的辅助函数:
setup_fault_attr(attr, str);
- debugfs 条目
failslab、fail_page_alloc、fail_usercopy 和 fail_make_request 使用这种方式。
辅助函数:
fault_create_debugfs_attr(name, parent, attr);
- 模块参数
如果故障注入能力的作用范围仅限于单个内核模块,最好通过模块参数来配置故障属性。
- 添加一个钩子来注入故障
当 should_fail() 返回 true 时,调用方代码应当注入一次故障:
should_fail(attr, size);
#!/bin/bash
#
# Load and unload each named kernel module while slab allocation failures
# are being injected into the modprobe process.
#
# Usage: <script> modulename [ modulename ... ]
#
# Writes to /sys/kernel/debug/failslab/* and /proc/self/make-it-fail, so it
# has to run as root on a kernel that exposes those files.

FAILTYPE=failslab
FAULT_DIR=/sys/kernel/debug/$FAILTYPE

# Validate the command line before touching any fault-injection knob.
if [ $# -eq 0 ]
then
    echo "Usage: $0 modulename [ modulename ... ]"
    exit 1
fi

# task-filter=Y pairs with the make-it-fail write done in faulty_system().
echo Y > "$FAULT_DIR/task-filter"
echo 10 > "$FAULT_DIR/probability"
echo 100 > "$FAULT_DIR/interval"
echo -1 > "$FAULT_DIR/times"
echo 0 > "$FAULT_DIR/space"
echo 2 > "$FAULT_DIR/verbose"
echo Y > "$FAULT_DIR/ignore-gfp-wait"

# Run the given command line in a fresh bash that first writes 1 to its own
# /proc/self/make-it-fail and then exec()s the command.
# Arguments: the command and its arguments.
faulty_system()
{
    # $* is deliberately flattened: bash -c takes one command string.
    bash -c "echo 1 > /proc/self/make-it-fail && exec $*"
}

for m in "$@"
do
    echo "inserting $m..."
    faulty_system modprobe "$m"

    echo "removing $m..."
    faulty_system modprobe -r "$m"
done
#!/bin/bash
#
# Inject page allocation failures restricted to one kernel module.
#
# Usage: <script> <modulename>
#
# Loads the module, writes the addresses read from its .text and .data
# section files into require-start/require-end, then keeps injection enabled
# until the script is interrupted. Needs root and debugfs.

FAILTYPE=fail_page_alloc
FAULT_DIR=/sys/kernel/debug/$FAILTYPE
module=$1

if [ -z "$module" ]
then
    echo "Usage: $0 <modulename>"
    exit 1
fi

modprobe "$module"

if [ ! -d "/sys/module/$module/sections" ]
then
    echo "Module $module is not loaded"
    exit 1
fi

# The section files hold the load addresses used as the required range.
cat "/sys/module/$module/sections/.text" > "$FAULT_DIR/require-start"
cat "/sys/module/$module/sections/.data" > "$FAULT_DIR/require-end"

echo N > "$FAULT_DIR/task-filter"
echo 10 > "$FAULT_DIR/probability"
echo 100 > "$FAULT_DIR/interval"
echo -1 > "$FAULT_DIR/times"
echo 0 > "$FAULT_DIR/space"
echo 2 > "$FAULT_DIR/verbose"
echo Y > "$FAULT_DIR/ignore-gfp-wait"
echo Y > "$FAULT_DIR/ignore-gfp-highmem"
echo 10 > "$FAULT_DIR/stacktrace-depth"

# Switch injection off again on interrupt, termination or normal exit.
trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT

echo "Injecting errors into the module $module... (interrupt to stop)"
sleep 1000000
#!/bin/bash
#
# Make open_ctree() return -12 through fail_function and check that mounting
# a freshly created btrfs image fails as a result.
#
# Creates testfile.img and tmpmnt/ in the current directory and removes them
# again at the end. Needs root, losetup, mkfs.btrfs and debugfs.

rm -f testfile.img
dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1

# Stop early if no loop device could be set up; every later step needs it.
if ! DEVICE=$(losetup --show -f testfile.img)
then
    echo "losetup failed" >&2
    rm -f testfile.img
    exit 1
fi

mkfs.btrfs -f "$DEVICE"
mkdir -p tmpmnt

FAILTYPE=fail_function
FAILFUNC=open_ctree
FAULT_DIR=/sys/kernel/debug/$FAILTYPE

echo "$FAILFUNC" > "$FAULT_DIR/inject"
# printf turns the negative errno into the hex form written to retval.
printf '%#x' -12 > "$FAULT_DIR/$FAILFUNC/retval"
echo N > "$FAULT_DIR/task-filter"
echo 100 > "$FAULT_DIR/probability"
echo 0 > "$FAULT_DIR/interval"
echo -1 > "$FAULT_DIR/times"
echo 0 > "$FAULT_DIR/space"
echo 1 > "$FAULT_DIR/verbose"

# The test passes when the mount FAILS, i.e. the injected error took effect.
if ! mount -t btrfs "$DEVICE" tmpmnt
then
    echo "SUCCESS!"
else
    echo "FAILED!"
    umount tmpmnt
fi

# Writing an empty line to inject resets the injection list.
echo > "$FAULT_DIR/inject"

rmdir tmpmnt
losetup -d "$DEVICE"
rm testfile.img
先配置/sys/kernel/debug/fail_io_timeout故障参数。
# 设置故障出现的可能性为100%
root@raspberrypi:/sys/kernel/debug/fail_io_timeout# echo 100 > probability
# 没有错误次数上限
root@raspberrypi:/sys/kernel/debug/fail_io_timeout# echo -1 > times
# 设置故障注入间隔:每 10 次 should_fail() 调用才注入一次(interval 是调用次数,不是毫秒)
root@raspberrypi:/sys/kernel/debug/fail_io_timeout# echo 10 > interval
使能故障
root@raspberrypi:/sys/kernel/debug/fail_io_timeout# echo 1 > /sys/block/sdb/io-timeout-fail
# 使用dd命令触发I/O流程
root@runninglinuxkernel:~# dd if=/dev/sdb of=./test.img bs=1M count=100 iflag=direct
[ 1430.673613] FAULT_INJECTION: forcing a failure.
name fail_io_timeout, interval 10, probability 100, space 0, times -1
[ 1430.673660] CPU: 2 PID: 107 Comm: usb-storage Tainted: G C 6.1.35-v8 #3
[ 1430.673676] Hardware name: Raspberry Pi 4 Model B Rev 1.5 (DT)
[ 1430.673685] Call trace:
[ 1430.673691] dump_backtrace+0xfc/0x108
[ 1430.673711] show_stack+0x20/0x30
[ 1430.673722] dump_stack_lvl+0x8c/0xb8
[ 1430.673740] dump_stack+0x18/0x34
[ 1430.673753] should_fail_ex+0x1e4/0x238
[ 1430.673770] should_fail+0x14/0x20
[ 1430.673784] __blk_should_fake_timeout+0x24/0x30
[ 1430.673804] scsi_done_internal+0x13c/0x160
[ 1430.673822] scsi_done_direct+0x1c/0x28
[ 1430.673837] usb_stor_control_thread+0x274/0x2b0
[ 1430.673855] kthread+0x100/0x118
[ 1430.673871] ret_from_fork+0x10/0x20
[ 1461.268398] VFS: busy inodes on changed media sdb
[ 1461.269517] sd 1:0:0:0: [sdb] 61440000 512-byte logical blocks: (31.5 GB/29.3 GiB)
[ 1461.290253] FAULT_INJECTION: forcing a failure.
name fail_io_timeout, interval 10, probability 100, space 0, times -1 <-- 设置的故障参数
[ 1461.290298] CPU: 0 PID: 107 Comm: usb-storage Tainted: G C 6.1.35-v8 #3
[ 1461.290315] Hardware name: Raspberry Pi 4 Model B Rev 1.5 (DT)
[ 1461.290325] Call trace: <-- 函数调用栈
[ 1461.290331] dump_backtrace+0xfc/0x108
[ 1461.290352] show_stack+0x20/0x30
[ 1461.290364] dump_stack_lvl+0x8c/0xb8
[ 1461.290381] dump_stack+0x18/0x34
[ 1461.290393] should_fail_ex+0x1e4/0x238
[ 1461.290411] should_fail+0x14/0x20
[ 1461.290425] __blk_should_fake_timeout+0x24/0x30
[ 1461.290445] scsi_done_internal+0x13c/0x160
[ 1461.290462] scsi_done_direct+0x1c/0x28
[ 1461.290477] usb_stor_control_thread+0x274/0x2b0
[ 1461.290496] kthread+0x100/0x118
[ 1461.290511] ret_from_fork+0x10/0x20
[ 1491.982600] sd 1:0:0:0: [sdb] tag#0 UNKNOWN(0x2003) Result: hostbyte=0x03 driverbyte=DRIVER_OK cmd_age=30s
[ 1491.982642] sd 1:0:0:0: [sdb] tag#0 CDB: opcode=0x28 28 00 03 a9 7f f9 00 00 01 00
[ 1491.982661] I/O error, dev sdb, sector 61439993 op 0x0:(READ) flags 0x80700 phys_seg 1 prio class 2
[ 1491.986407] FAULT_INJECTION: forcing a failure.
name fail_io_timeout, interval 10, probability 100, space 0, times -1
[ 1491.986456] CPU: 0 PID: 107 Comm: usb-storage Tainted: G C 6.1.35-v8 #3
[ 1491.986475] Hardware name: Raspberry Pi 4 Model B Rev 1.5 (DT)
[ 1491.986485] Call trace:
[ 1491.986491] dump_backtrace+0xfc/0x108
[ 1491.986513] show_stack+0x20/0x30
[ 1491.986524] dump_stack_lvl+0x8c/0xb8
[ 1491.986542] dump_stack+0x18/0x34
[ 1491.986555] should_fail_ex+0x1e4/0x238
[ 1491.986574] should_fail+0x14/0x20
[ 1491.986588] __blk_should_fake_timeout+0x24/0x30
[ 1491.986609] scsi_done_internal+0x13c/0x160
[ 1491.986626] scsi_done_direct+0x1c/0x28
[ 1491.986641] usb_stor_control_thread+0x274/0x2b0
[ 1491.986661] kthread+0x100/0x118
[ 1491.986676] ret_from_fork+0x10/0x20
# 数据拷贝进程进入D状态
curtis@raspberrypi:~ $ ps aux | grep dd
root 4149 0.0 0.0 6016 2508 pts/0 D+ 00:38 0:00 dd if=/dev/sdb of=./test.img bs=1M count=100 iflag=direct
curtis 4325 0.0 0.0 6044 636 pts/1 S+ 00:40 0:00 grep --color=auto dd
curtis@raspberrypi:~ $
In order to make it easier to accomplish the tasks mentioned above, we can use tools/testing/fault-injection/failcmd.sh. Please run a command “./tools/testing/fault-injection/failcmd.sh --help” for more information and see the following examples.
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# NAME
#	failcmd.sh - run a command with injecting slab/page allocation failures
#
# SYNOPSIS
#	failcmd.sh --help
#	failcmd.sh [<options>] command [arguments]
#
# DESCRIPTION
#	Run command with injecting slab/page allocation failures by fault
#	injection.
#
#	NOTE: you need to run this script as root.
#

# Print the option summary to stderr.
usage()
{
	cat >&2 <<EOF
Usage: $0 [options] command [arguments]

OPTIONS
	-p percent
	--probability=percent
		likelihood of failure injection, in percent.
		Default value is 1

	-t value
	--times=value
		specifies how many times failures may happen at most.
		Default value is 1

	--oom-kill-allocating-task=value
		set /proc/sys/vm/oom_kill_allocating_task to specified value
		before running the command.
		Default value is 1

	-h, --help
		Display a usage message and exit

	--interval=value, --space=value, --verbose=value, --task-filter=value,
	--stacktrace-depth=value, --require-start=value, --require-end=value,
	--reject-start=value, --reject-end=value, --ignore-gfp-wait=value
		See Documentation/fault-injection/fault-injection.rst for more
		information

	failslab options:
	--cache-filter=value

	fail_page_alloc options:
	--ignore-gfp-highmem=value, --min-order=value

ENVIRONMENT
	FAILCMD_TYPE
		The following values for FAILCMD_TYPE are recognized:

		failslab
			inject slab allocation failures
		fail_page_alloc
			inject page allocation failures

		If FAILCMD_TYPE is not defined, then failslab is used.
EOF
}

if [ "$UID" != 0 ]; then
	echo must be run as root >&2
	exit 1
fi

# Third field of the first "mount -t debugfs" line is the mount point.
DEBUGFS=$(mount -t debugfs | head -1 | awk '{ print $3}')

if [ ! -d "$DEBUGFS" ]; then
	echo debugfs is not mounted >&2
	exit 1
fi

FAILCMD_TYPE=${FAILCMD_TYPE:-failslab}
FAULTATTR=$DEBUGFS/$FAILCMD_TYPE

if [ ! -d "$FAULTATTR" ]; then
	echo "$FAILCMD_TYPE" is not available >&2
	exit 1
fi

LONGOPTS=probability:,interval:,times:,space:,verbose:,task-filter:
LONGOPTS=$LONGOPTS,stacktrace-depth:,require-start:,require-end:
LONGOPTS=$LONGOPTS,reject-start:,reject-end:,oom-kill-allocating-task:,help

# The type-specific options are only accepted for the matching fault type.
if [ "$FAILCMD_TYPE" = failslab ]; then
	LONGOPTS=$LONGOPTS,ignore-gfp-wait:,cache-filter:
elif [ "$FAILCMD_TYPE" = fail_page_alloc ]; then
	LONGOPTS=$LONGOPTS,ignore-gfp-wait:,ignore-gfp-highmem:,min-order:
fi

if ! TEMP=$(getopt -o p:i:t:s:v:h --long "$LONGOPTS" -n 'failcmd.sh' -- "$@"); then
	usage
	exit 1
fi

# getopt emits a shell-quoted list; eval re-parses it into "$@".
eval set -- "$TEMP"

# Put task-filter, probability and times back to their inactive values.
fault_attr_default()
{
	echo N > "$FAULTATTR/task-filter"
	echo 0 > "$FAULTATTR/probability"
	echo 1 > "$FAULTATTR/times"
}

fault_attr_default

oom_kill_allocating_task_saved=$(cat /proc/sys/vm/oom_kill_allocating_task)

# Undo everything this script changed; installed as the exit/signal trap.
restore_values()
{
	fault_attr_default
	echo "$oom_kill_allocating_task_saved" \
		> /proc/sys/vm/oom_kill_allocating_task
}

#
# Default options
#
declare -i oom_kill_allocating_task=1
declare task_filter=Y
declare -i probability=1
declare -i times=1

# probability, times, task-filter and oom-kill-allocating-task are only
# recorded here and applied after parsing; the rest is written immediately.
while true; do
	case "$1" in
	-p|--probability)
		probability=$2
		shift 2
		;;
	-i|--interval)
		echo "$2" > "$FAULTATTR/interval"
		shift 2
		;;
	-t|--times)
		times=$2
		shift 2
		;;
	-s|--space)
		echo "$2" > "$FAULTATTR/space"
		shift 2
		;;
	-v|--verbose)
		echo "$2" > "$FAULTATTR/verbose"
		shift 2
		;;
	--task-filter)
		task_filter=$2
		shift 2
		;;
	--stacktrace-depth)
		echo "$2" > "$FAULTATTR/stacktrace-depth"
		shift 2
		;;
	--require-start)
		echo "$2" > "$FAULTATTR/require-start"
		shift 2
		;;
	--require-end)
		echo "$2" > "$FAULTATTR/require-end"
		shift 2
		;;
	--reject-start)
		echo "$2" > "$FAULTATTR/reject-start"
		shift 2
		;;
	--reject-end)
		echo "$2" > "$FAULTATTR/reject-end"
		shift 2
		;;
	--oom-kill-allocating-task)
		oom_kill_allocating_task=$2
		shift 2
		;;
	--ignore-gfp-wait)
		echo "$2" > "$FAULTATTR/ignore-gfp-wait"
		shift 2
		;;
	--cache-filter)
		# NOTE(review): every other attribute file here is spelled with
		# hyphens; confirm whether this should be "cache-filter".
		echo "$2" > "$FAULTATTR/cache_filter"
		shift 2
		;;
	--ignore-gfp-highmem)
		echo "$2" > "$FAULTATTR/ignore-gfp-highmem"
		shift 2
		;;
	--min-order)
		echo "$2" > "$FAULTATTR/min-order"
		shift 2
		;;
	-h|--help)
		usage
		exit 0
		;;
	--)
		shift
		break
		;;
	*)
		# Anything unhandled would otherwise spin this loop forever.
		usage
		exit 1
		;;
	esac
done

# No command given: nothing to run.
[ -z "$1" ] && exit 0

echo "$oom_kill_allocating_task" > /proc/sys/vm/oom_kill_allocating_task
echo "$task_filter" > "$FAULTATTR/task-filter"
echo "$probability" > "$FAULTATTR/probability"
echo "$times" > "$FAULTATTR/times"

trap "restore_values" SIGINT SIGTERM EXIT

# The child shell writes 1 to its own make-it-fail file before exec()ing the
# command. $* flattens the arguments into the single string bash -c expects.
cmd="echo 1 > /proc/self/make-it-fail && exec $*"
bash -c "$cmd"
Examples:
Run a command “make -C tools/testing/selftests/ run_tests” with injecting slab
allocation failure::
# ./tools/testing/fault-injection/failcmd.sh \
-- make -C tools/testing/selftests/ run_tests
Same as above except to specify 100 times failures at most instead of one time
at most by default::
# ./tools/testing/fault-injection/failcmd.sh --times=100 \
-- make -C tools/testing/selftests/ run_tests
Same as above except to inject page allocation failure instead of slab
allocation failure::
# env FAILCMD_TYPE=fail_page_alloc \
./tools/testing/fault-injection/failcmd.sh --times=100 \
-- make -C tools/testing/selftests/ run_tests
The following code systematically faults 0-th, 1-st, 2-nd and so on capabilities in the socketpair() system call::
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>

/*
 * For i = 1, 2, 3, ...: write i to this thread's fail-nth file, call
 * socketpair(), then read fail-nth back. A read-back of 0 is reported as
 * 'Y' (fault injected); a non-zero read-back is reported as 'N' and ends
 * the loop.
 *
 * Returns 0 after the loop ends, 1 if the fail-nth file cannot be opened,
 * written or read.
 */
int main(void)
{
	int i, err, res, fail_nth, fds[2];
	char buf[128];
	ssize_t len;

	system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait");
	sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid));
	fail_nth = open(buf, O_RDWR);
	if (fail_nth < 0) {
		/* buf still holds the path, which makes a useful message. */
		perror(buf);
		return 1;
	}

	for (i = 1;; i++) {
		sprintf(buf, "%d", i);
		if (write(fail_nth, buf, strlen(buf)) < 0) {
			perror("write fail-nth");
			close(fail_nth);
			return 1;
		}

		res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds);
		/* Save errno before any later call can overwrite it. */
		err = errno;

		/* Leave room for the terminator that atoi() relies on. */
		len = pread(fail_nth, buf, sizeof(buf) - 1, 0);
		if (len < 0) {
			perror("pread fail-nth");
			close(fail_nth);
			return 1;
		}
		buf[len] = '\0';

		if (res == 0) {
			close(fds[0]);
			close(fds[1]);
		}

		printf("%d-th fault %c: res=%d/%d\n", i, atoi(buf) ? 'N' : 'Y',
			res, err);
		if (atoi(buf))
			break;
	}

	close(fail_nth);
	return 0;
}
An example output::
1-th fault Y: res=-1/23 2-th fault Y: res=-1/23 3-th fault Y: res=-1/12 4-th fault Y: res=-1/12 5-th fault Y: res=-1/23 6-th fault Y: res=-1/23 7-th fault Y: res=-1/23 8-th fault Y: res=-1/12 9-th fault Y: res=-1/12 10-th fault Y: res=-1/12 11-th fault Y: res=-1/12 12-th fault Y: res=-1/12 13-th fault Y: res=-1/12 14-th fault Y: res=-1/12 15-th fault Y: res=-1/12 16-th fault N: res=0/12
后续使用的时候不断完善。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。