赞
踩
本章简要地介绍用户空间应用程序与内核通信或读取内核输出信息的主要机制.
概述:
内核通过各种不同的接口把内部信息输出到用户空间.除了程序员用于请求信息的经典系统调用(system call)外,还有三个特殊的接口,两个是虚拟文件系统,剩下的一个是系统调用.
1)procfs(/proc文件系统)
这是一个虚拟文件系统,通常挂载在/proc目录下.它允许内核以文件的形式向用户空间输出内部信息.这些文件并没有实际存在于磁盘上,但是可以通过cat或more读取,也可以通过shell的 > 重定向符写入.
sysctl(/proc/sys目录)
此接口允许用户空间读取或修改内核变量的值.不能够用此接口对每个内核变量进行操作,内核应明确指出哪些变量从这个接口是可见的.
从用户空间你可以用两种方式访问sysctl输出的变量.一种是sysctl系统调用(参见man sysctl)
man sysctl部分描述:
man sysctl
DESCRIPTION
sysctl is used to modify kernel parameters at runtime. The parameters available are those listed under /proc/sys/. Procfs is
required for sysctl support in Linux. You can use sysctl to both read and write sysctl data.
另一种方式是procfs.当内核支持procfs的时候,会在/proc下添加一个特殊目录/proc/sys/,为每一个由sysctl所输出的变量引入一个文件.
2)sysfs(/sys文件系统)
linux设备驱动程序中有对此的很好的解释.
3)ioctl系统调用(输入/输出控制)
ioctl系统调用操作的对象是一个文件,通常是用来实现特殊设备所需但是标准文件系统没有提供的操作.你也可以把socket系统调用返回的套接字描述符传递给ioctl,而这也是网络代码使用ioctl的方式.此接口也由老一代命令所用,如ifconfig route等.
procfs 与 sysctl
procfs和sysctl都输出内核的内部信息,但是procfs主要是输出只读数据,而大多数sysctl信息都是可写入的,但只有超级用户可以写入.
procfs
大多数网络功能在其初始化的时候都会在/proc中注册一个或多个文件.当一个用户读取该文件的时候,会引起内核间接运行一组内核函数,以返回某种输出内容.
/proc中的目录可以使用proc_mkdir创建.
proc_mkdir
Defined as a function in:
fs/proc/generic.c, line 511
511 struct proc_dir_entry *proc_mkdir(const char *name,
512 struct proc_dir_entry *parent)
513 {
514 return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
515 }
Defined as a function prototype in:(prototype:原型)
include/linux/proc_fs.h, line 147
147 extern struct proc_dir_entry *proc_mkdir(const char *,struct proc_dir_entry *);
/proc/net中的文件可以使用定义在include/linux/proc_fs.h中的proc_net_fops_create和proc_net_remove予以注册和除名,这两个函数都是包裹函数,其中含有通用的API:create_proc_entry和remove_proc_entry.特别的,proc_net_fops_create负责创建文件(用proc_net_create),然后初始化其文件操作处理函数.
proc_net_fops_create
Defined as a function in:
include/linux/proc_fs.h, line 177
177 static inline struct proc_dir_entry *proc_net_fops_create(const char *name,
178 mode_t mode, struct file_operations *fops)
179 {
180 struct proc_dir_entry *res = create_proc_entry(name, mode, proc_net);
181
182 if (res)
183 res->proc_fops = fops;
184 return res;
185 }
create_proc_entry
Defined as a function prototype in:
include/linux/proc_fs.h, line 95
95 extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
96 struct proc_dir_entry *parent);
Defined as a function in:
fs/proc/generic.c, line 517
517 struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
518 struct proc_dir_entry *parent)
519 {
520 struct proc_dir_entry *ent;
521 nlink_t nlink;
522
523 if (S_ISDIR(mode)) {
524 if ((mode & S_IALLUGO) == 0)
525 mode |= S_IRUGO | S_IXUGO;
526 nlink = 2;
527 } else {
528 if ((mode & S_IFMT) == 0)
529 mode |= S_IFREG;
530 if ((mode & S_IALLUGO) == 0)
531 mode |= S_IRUGO;
532 nlink = 1;
533 }
534
535 ent = proc_create(&parent,name,mode,nlink);
536 if (ent) {
537 if (S_ISDIR(mode)) {
538 ent->proc_fops = &proc_dir_operations;
539 ent->proc_iops = &proc_dir_inode_operations;
540 }
541 if (proc_register(parent, ent) < 0) {
542 kfree(ent);
543 ent = NULL;
544 }
545 }
546 return ent;
547 }
proc_dir_entry
Defined as a struct type in:
include/linux/proc_fs.h, line 53
53 struct proc_dir_entry {
54 unsigned short low_ino;
55 unsigned short namelen;
56 const char *name;
57 mode_t mode;
58 nlink_t nlink;
59 uid_t uid;
60 gid_t gid;
61 unsigned long size;
62 struct inode_operations * proc_iops;
63 struct file_operations * proc_fops;
64 get_info_t *get_info;
65 struct module *owner;
66 struct proc_dir_entry *next, *parent, *subdir;
67 void *data;
68 read_proc_t *read_proc;
69 write_proc_t *write_proc;
70 atomic_t count; /* use count */
71 int deleted; /* delete flag */
72 kdev_t rdev;
73 void *set;
74 };
proc_net_remove
Defined as a function in:
include/linux/proc_fs.h, line 187
187 static inline void proc_net_remove(const char *name)
188 {
189 remove_proc_entry(name,proc_net);
190 }
remove_proc_entry
Defined as a function prototype in:
include/linux/proc_fs.h, line 203
203 static inline void remove_proc_entry(const char *name, struct proc_dir_entry *parent) {};
Defined as a function in:
fs/proc/generic.c, line 565
565 void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
566 {
567 struct proc_dir_entry **p;
568 struct proc_dir_entry *de;
569 const char *fn = name;
570 int len;
571
572 if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
573 goto out;
574 len = strlen(fn);
575 for (p = &parent->subdir; *p; p=&(*p)->next ) {
576 if (!proc_match(len, fn, *p))
577 continue;
578 de = *p;
579 *p = de->next;
580 de->next = NULL;
581 if (S_ISDIR(de->mode))
582 parent->nlink--;
583 clear_bit(de->low_ino - PROC_DYNAMIC_FIRST,
584 proc_alloc_map);
585 proc_kill_inodes(de);
586 de->nlink = 0;
587 if (!atomic_read(&de->count))
588 free_proc_entry(de);
589 else {
590 de->deleted = 1;
591 printk("remove_proc_entry: %s/%s busy, count=%d\n",
592 parent->name, de->name, atomic_read(&de->count));
593 }
594 break;
595 }
596 out:
597 return;
598 }
我们来看一个以ARP协议为注册对象的示例,看看ARP协议是如何在/proc/net/中注册其arp文件的.
arp_seq_fops
Defined as a variable in:
net/ipv4/arp.c, line 1344
1344 static struct file_operations arp_seq_fops = {
1345 .owner = THIS_MODULE,
1346 .open = arp_seq_open,
1347 .read = seq_read,
1348 .llseek = seq_lseek,
1349 .release = seq_release_private,
1350 };
1351 #endif /* CONFIG_PROC_FS */
Referenced (in 3 files total) in:
net/ipv4/arp.c, line 1400
void __init arp_init (void)
1394 {
1395 neigh_table_init(&arp_tbl);
1396
1397 dev_add_pack(&arp_packet_type);
1398
1399 #ifdef CONFIG_PROC_FS
1400 if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops))
1401 panic("unable to create arp proc entry");
1402 #endif
1403 #ifdef CONFIG_SYSCTL
1404 neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4");
1405 #endif
1406 register_netdevice_notifier(&arp_netdev_notifier);
1407 }
sysctl:目录/proc/sys
用户在/proc/sys下看到的一个文件,实际上是一个内核变量.就每个变量而言,内核可以定义:
访问权限
输出到/proc/sys中的变量内容可以借助于相关联的文件进行读写或者直接用sysctl系统调用.
下图是/proc/sys下的内容:
/proc/sys下的有些目录和文件是在引导期间静态定义的,还有些是在运行期间定义的.导致运行期间创建目录或文件的事件示例如下:
当一个新的网络设备被注册或除名时.
/proc/sys中的文件和目录都是以ctl_table结构定义的.ctl_table结构的注册和除名是通过在kernel/sysctl.c中定义的register_sysctl_table和unregister_sysctl_table函数完成.
ctl_table
Defined as a struct type in:
include/linux/sysctl.h, line 813
812 /* A sysctl table is an array of struct ctl_table: */
813 struct ctl_table
814 {
815 int ctl_name; /* Binary ID */
816 const char *procname; /* Text ID for /proc/sys, or zero */
817 void *data;
818 int maxlen;
819 mode_t mode;
820 ctl_table *child;
821 proc_handler *proc_handler; /* Callback for text formatting */
822 ctl_handler *strategy; /* Callback function for all r/w */
823 struct proc_dir_entry *de; /* /proc control block */
824 void *extra1;
825 void *extra2;
826 };
各个关键字段的含义:
const char *procname
在/proc/sys中所用的文件名
int maxlen
输出的内核变量的尺寸大小
mode_t mode
分派给/proc/sys中相关联的文件或目录的访问权限
ctl_table *child
用于建立目录与文件之间的父子关系
proc_handler
当你在一个/proc/sys中读取或是写入一个文件时,完成读取或是写入操作的函数.
strategy
可选字段:可以初始化为一个函数,用来在显示或存储数据之前完成额外的格式化工作.
extra1 extra2
两个可选参数,通常用于定义变量的最小值和最大值.
register_sysctl_table
Defined as a function prototype in:
include/linux/sysctl.h, line 838
Defined as a function in:
kernel/sysctl.c, line 652
652 struct ctl_table_header *register_sysctl_table(ctl_table * table,
653 int insert_at_head)
654 {
655 struct ctl_table_header *tmp;
656 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
657 if (!tmp)
658 return NULL;
659 tmp->ctl_table = table;
660 INIT_LIST_HEAD(&tmp->ctl_entry);
661 tmp->used = 0;
662 tmp->unregistering = NULL;
663 spin_lock(&sysctl_lock);
664 if (insert_at_head)
665 list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
666 else
667 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
668 spin_unlock(&sysctl_lock);
669 #ifdef CONFIG_PROC_FS
670 register_proc_table(table, proc_sys_root, tmp);
671 #endif
672 return tmp;
673 }
unregister_sysctl_table
Defined as a function prototype in:
include/linux/sysctl.h, line 840
Defined as a function in:
kernel/sysctl.c, line 682
675 /**
676 * unregister_sysctl_table - unregister a sysctl table hierarchy
677 * @header: the header returned from register_sysctl_table
678 *
679 * Unregisters the sysctl table and all children. proc entries may not
680 * actually be removed until they are no longer used by anyone.
681 */
682 void unregister_sysctl_table(struct ctl_table_header * header)
683 {
684 spin_lock(&sysctl_lock);
685 start_unregistering(header);
686 #ifdef CONFIG_PROC_FS
687 unregister_proc_table(header->ctl_table, proc_sys_root);
688 #endif
689 spin_unlock(&sysctl_lock);
690 kfree(header);
691 }
来看一个ctl_table的实例化:
Linux/kernel/sysctl.c
1 /*
2 * sysctl.c: General linux system control interface
3 *(通用linux系统控制接口)
4 * Begun 24 March 1995, Stephen Tweedie
5 * Added /proc support, Dec 1995
6 * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
7 * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver.
8 * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver.
9 * Dynamic registration fixes, Stephen Tweedie.
10 * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn.
11 * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris
12 * Horn.
13 * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer.
14 * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer.
15 * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill
16 * Wendling.
17 * The list_for_each() macro wasn't appropriate for the sysctl loop.
18 * Removed it and replaced it with older style, 03/23/00, Bill Wendling
19 */
159 #ifdef CONFIG_NET
160 {CTL_NET, "net", NULL, 0, 0555, net_table},
161 #endif
在/proc/sys中注册文件步骤
我们知道可以分别使用register_sysctl_table和unregister_sysctl_table在/proc/sys中注册或除名文件.函数register_sysctl_table需要两个输入参数:
1)指向一个ctl_table实体的指针
2)一个标识,指出新元素应该放在位于相同目录中ctl_table实体列表的何处,头(1)尾(0)
584 * register_sysctl_table - register a sysctl hierarchy
585 * @table: the top-level table structure
586 * @insert_at_head: whether the entry should be inserted in front or at the end
struct ctl_table_header *register_sysctl_table(ctl_table * table,
653 int insert_at_head)
注意:register_sysctl_table的输入并不包括输入参数ctl_table应该添加到/proc/sys文件系统中何处的参考值.原因在于所有的目录插入都是针对/proc/sys目录进行,如果你想把一个文件注册到/proc/sys的子目录,就必须建立一棵树(意味着多个由child字段链接的ctl_table实体)以提供完整的路径,然后把代表你刚建立的树根的ctl_table实体传递给register_sysctl_table.当该树的任何节点尚未存在时,就会被创建.
来看一个简单的实例.下面这段代码显示文件 logging_level的定义以及如何放置到/proc/sys/dev/scsi目录的:
drivers/scsi/scsi_sysctl.c:
15 static struct ctl_table scsi_table[] = {
16 { .procname = "logging_level",
17 .data = &scsi_logging_level,
18 .maxlen = sizeof(scsi_logging_level),
19 .mode = 0644,
20 .proc_handler = proc_dointvec },
21 { }
22 };
23
24 static struct ctl_table scsi_dir_table[] = {
25 { .procname = "scsi",
26 .mode = 0555,
27 .child = scsi_table },
28 { }
29 };
30
31 static struct ctl_table scsi_root_table[] = {
32 { .procname = "dev",
33 .mode = 0555,
34 .child = scsi_dir_table },
35 { }
36 };
37
38 static struct ctl_table_header *scsi_table_header;
39
40 int __init scsi_init_sysctl(void)
41 {
42 scsi_table_header = register_sysctl_table(scsi_root_table);
43 if (!scsi_table_header)
44 return -ENOMEM;
45 return 0;
46 }
47
注意,register_sysctl_table接收的是scsi_root_table,也就是代码所定义的ctl_table树的树根.(另外请注意,这段示例代码调用register_sysctl_table时只传入了一个参数,与前文所列的带insert_at_head参数的两参数版本不同,两段代码应当取自不同的内核版本.)
如果稍后你想把另一个文件添加到同一个目录,例如abc文件,你需要定义一棵类似的树.也就是两个与dev和scsi目录相同的实体,外加上新文件abc的新的ctl_table实体.不过这里重复了dev,scsi目录的创建过程了.如果想重用该怎么做呢?
有时候,开发人员为了简化把新文件添加到已存在的目录而定义一个模板,然后每次有新文件添加到相同目录的时候,就对相同的目录予以重用.使用模板的好处是ctl_table实体只需要初始化一次便可以贯穿整个目录,之后每增加一个新文件时,只需要对叶节点初始化.
例如,可以看看邻居子系统在net/core/neighbour.c中是如何通过neigh_sysctl_register使用模板neigh_sysctl_template的.
net/core/neighbour.c:
定义一个模板neigh_sysctl_template
static struct neigh_sysctl_table {
3054 struct ctl_table_header *sysctl_header;
3055 struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
3056 } neigh_sysctl_template __read_mostly = {
3057 .neigh_vars = {
3058 NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
3059 NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"),
3060 NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"),
3061 NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"),
3062 NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
3063 NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
3064 NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
3065 NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
3066 NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
3067 NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
3068 NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"),
3069 NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"),
3070 NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"),
3071 NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"),
3072 NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"),
3073 NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"),
3074 [NEIGH_VAR_GC_INTERVAL] = {
3075 .procname = "gc_interval",
3076 .maxlen = sizeof(int),
3077 .mode = 0644,
3078 .proc_handler = proc_dointvec_jiffies,
3079 },
3080 [NEIGH_VAR_GC_THRESH1] = {
3081 .procname = "gc_thresh1",
3082 .maxlen = sizeof(int),
3083 .mode = 0644,
3084 .extra1 = &zero,
3085 .extra2 = &int_max,
3086 .proc_handler = proc_dointvec_minmax,
3087 },
3088 [NEIGH_VAR_GC_THRESH2] = {
3089 .procname = "gc_thresh2",
3090 .maxlen = sizeof(int),
3091 .mode = 0644,
3092 .extra1 = &zero,
3093 .extra2 = &int_max,
3094 .proc_handler = proc_dointvec_minmax,
3095 },
3096 [NEIGH_VAR_GC_THRESH3] = {
3097 .procname = "gc_thresh3",
3098 .maxlen = sizeof(int),
3099 .mode = 0644,
3100 .extra1 = &zero,
3101 .extra2 = &int_max,
3102 .proc_handler = proc_dointvec_minmax,
3103 },
3104 {},
3105 },
3106 };
3107
3108 int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
3109 proc_handler *handler)
3110 {
3111 int i;
3112 struct neigh_sysctl_table *t;
3113 const char *dev_name_source;
3114 char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ];
3115 char *p_name;
3116
3117 t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);
3118 if (!t)
3119 goto err;
3120
3121 for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) {
3122 t->neigh_vars[i].data += (long) p;
3123 t->neigh_vars[i].extra1 = dev;
3124 t->neigh_vars[i].extra2 = p;
3125 }
3126
3127 if (dev) {
3128 dev_name_source = dev->name;
3129 /* Terminate the table early */
3130 memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
3131 sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
3132 } else {
3133 struct neigh_table *tbl = p->tbl;
3134 dev_name_source = "default";
3135 t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval;
3136 t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1;
3137 t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2;
3138 t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3;
3139 }
3140
3141 if (handler) {
3142 /* RetransTime */
3143 t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
3144 /* ReachableTime */
3145 t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
3146 /* RetransTime (in milliseconds)*/
3147 t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
3148 /* ReachableTime (in milliseconds) */
3149 t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
3150 } else {
3151 /* Those handlers will update p->reachable_time after
3152 * base_reachable_time(_ms) is set to ensure the new timer starts being
3153 * applied after the next neighbour update instead of waiting for
3154 * neigh_periodic_work to update its value (can be multiple minutes)
3155 * So any handler that replaces them should do this as well
3156 */
3157 /* ReachableTime */
3158 t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler =
3159 neigh_proc_base_reachable_time;
3160 /* ReachableTime (in milliseconds) */
3161 t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler =
3162 neigh_proc_base_reachable_time;
3163 }
3164
3165 /* Don't export sysctls to unprivileged users */
3166 if (neigh_parms_net(p)->user_ns != &init_user_ns)
3167 t->neigh_vars[0].procname = NULL;
3168
3169 switch (neigh_parms_family(p)) {
3170 case AF_INET:
3171 p_name = "ipv4";
3172 break;
3173 case AF_INET6:
3174 p_name = "ipv6";
3175 break;
3176 default:
3177 BUG();
3178 }
3179
3180 snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",
3181 p_name, dev_name_source);
3182 t->sysctl_header =
3183 register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars);
3184 if (!t->sysctl_header)
3185 goto free;
3186
3187 p->sysctl_table = t;
3188 return 0;
3189
3190 free:
3191 kfree(t);
3192 err:
3193 return -ENOBUFS;
3194 }
3195 EXPORT_SYMBOL(neigh_sysctl_register);
3196
不过使用模板的neigh_sysctl_register()代码没有看懂,欢迎交流学习
email:
happy.xhx@163.com
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。