赞
踩
目录
2.3.3 BPF_FUNC_map_lookup_elem
/*
 * Issue BPF_PROG_LOAD, retrying while the call fails with EAGAIN.
 *
 * The number of attempts is bounded so that a load which keeps failing
 * with EAGAIN cannot spin forever; once the budget is exhausted the last
 * failure (negative value, errno still EAGAIN) is handed back to the
 * caller like any other error.
 *
 * attr: attribute union passed through to sys_bpf() unchanged.
 * size: size argument passed through to sys_bpf() unchanged.
 *
 * Returns whatever the final sys_bpf() call returned; the loop treats a
 * negative value as failure.
 */
static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size)
{
	enum { PROG_LOAD_MAX_ATTEMPTS = 5 };
	int attempts = PROG_LOAD_MAX_ATTEMPTS;
	int fd;

	do {
		fd = sys_bpf(BPF_PROG_LOAD, attr, size);
	} while (fd < 0 && errno == EAGAIN && --attempts > 0);

	return fd;
}
/*
 * Run the BPF_OBJ_GET_INFO_BY_FD command for the object behind prog_fd.
 *
 * info:     caller buffer; its address is placed in attr.info.info.
 * info_len: in:  value placed in attr.info.info_len before the call;
 *           out: overwritten with attr.info.info_len when the call
 *                returned 0, left untouched otherwise.
 *
 * Returns the sys_bpf() result.
 */
int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len)
{
	union bpf_attr attr;
	int ret;

	/* Zero the whole union so every field not set below reads as 0. */
	memset(&attr, 0, sizeof(attr));
	attr.info.info = ptr_to_u64(info);
	attr.info.info_len = *info_len;
	attr.info.bpf_fd = prog_fd;

	ret = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
	if (ret)
		return ret;

	/* Report back the length stored in attr after the call. */
	*info_len = attr.info.info_len;
	return 0;
}
使用的是netlink消息跟内核通信,把fd和dev信息以及flag发送到内核,netlink使用的是NETLINK_ROUTE
/*
 * Send an RTM_SETLINK netlink request for interface ifindex carrying a
 * nested IFLA_XDP attribute.
 *
 * The nest always contains IFLA_XDP_FD (the value of fd); IFLA_XDP_FLAGS
 * is appended only when flags is non-zero.
 *
 * Returns:
 *   - the (negative) value from libbpf_netlink_open() if the socket could
 *     not be opened,
 *   - -errno if send() failed,
 *   - otherwise the result of bpf_netlink_recv().
 */
int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
{
	struct {
		struct nlmsghdr nh;
		struct ifinfomsg ifinfo;
		char attrbuf[64];
	} req;
	struct nlattr *nest, *attr;
	__u32 nl_pid;
	int seq = 1;
	int sock, ret;

	sock = libbpf_netlink_open(&nl_pid);
	if (sock < 0)
		return sock;

	/* Fixed part of the request: netlink header + ifinfomsg. */
	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nh.nlmsg_type = RTM_SETLINK;
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.nh.nlmsg_seq = seq;
	req.nh.nlmsg_pid = 0;
	req.ifinfo.ifi_index = ifindex;
	req.ifinfo.ifi_family = AF_UNSPEC;

	/* Open the nested XDP attribute right after the aligned fixed part. */
	nest = (struct nlattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	nest->nla_type = NLA_F_NESTED | IFLA_XDP;
	nest->nla_len = NLA_HDRLEN;

	/* First nested entry: the program fd. */
	attr = (struct nlattr *)((char *)nest + nest->nla_len);
	attr->nla_type = IFLA_XDP_FD;
	attr->nla_len = NLA_HDRLEN + sizeof(int);
	memcpy((char *)attr + NLA_HDRLEN, &fd, sizeof(fd));
	nest->nla_len += attr->nla_len;

	/* Optional second nested entry: caller-supplied flags. */
	if (flags != 0) {
		attr = (struct nlattr *)((char *)nest + nest->nla_len);
		attr->nla_type = IFLA_XDP_FLAGS;
		attr->nla_len = NLA_HDRLEN + sizeof(flags);
		memcpy((char *)attr + NLA_HDRLEN, &flags, sizeof(flags));
		nest->nla_len += attr->nla_len;
	}

	/* Account for the whole nest in the total message length. */
	req.nh.nlmsg_len += NLA_ALIGN(nest->nla_len);

	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0)
		ret = -errno;	/* capture errno before close() can change it */
	else
		ret = bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL);

	close(sock);
	return ret;
}
/*
 * Translate a struct bpf_create_map_attr into a union bpf_attr and run
 * the BPF_MAP_CREATE command with it.
 *
 * create_attr->name is optional; when present, at most
 * BPF_OBJ_NAME_LEN - 1 bytes of it are copied into attr.map_name.
 *
 * Returns the sys_bpf() result.
 */
int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
{
	union bpf_attr attr;

	/* Start from an all-zero union; unset fields stay 0. */
	memset(&attr, '\0', sizeof(attr));

	/* Basic map geometry and flags. */
	attr.map_type = create_attr->map_type;
	attr.map_flags = create_attr->map_flags;
	attr.key_size = create_attr->key_size;
	attr.value_size = create_attr->value_size;
	attr.max_entries = create_attr->max_entries;

	/* Remaining pass-through fields. */
	attr.numa_node = create_attr->numa_node;
	attr.map_ifindex = create_attr->map_ifindex;
	attr.inner_map_fd = create_attr->inner_map_fd;

	/* BTF description of key and value. */
	attr.btf_fd = create_attr->btf_fd;
	attr.btf_key_type_id = create_attr->btf_key_type_id;
	attr.btf_value_type_id = create_attr->btf_value_type_id;

	if (create_attr->name) {
		/*
		 * Truncating copy: bytes of map_name beyond the copied
		 * length keep the zeros written by the memset above.
		 */
		memcpy(attr.map_name, create_attr->name,
		       min(strlen(create_attr->name), BPF_OBJ_NAME_LEN - 1));
	}

	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
/*
 * Run the BPF_MAP_LOOKUP_ELEM command on the map referred to by fd.
 *
 * key:   address passed to the kernel in attr.key.
 * value: address passed to the kernel in attr.value.
 *
 * Returns the sys_bpf() result.
 */
int bpf_map_lookup_elem(int fd, const void *key, void *value)
{
	union bpf_attr attr;

	/* All fields other than the three set below are sent as zero. */
	memset(&attr, 0, sizeof(attr));
	attr.value = ptr_to_u64(value);
	attr.key = ptr_to_u64(key);
	attr.map_fd = fd;

	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
/*
 * Run the BPF_OBJ_PIN command: hand the kernel an object fd together
 * with the address of a path string.
 *
 * Returns the sys_bpf() result.
 */
int bpf_obj_pin(int fd, const char *pathname)
{
	union bpf_attr attr;

	/* Zero first so every field not set below is sent as 0. */
	memset(&attr, 0, sizeof(attr));
	attr.bpf_fd = fd;
	attr.pathname = ptr_to_u64((void *)pathname);

	return sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr));
}
/*
 * Run the BPF_OBJ_GET command for the given path string.
 *
 * Only attr.pathname is filled in; the rest of the union is zero.
 *
 * Returns the sys_bpf() result.
 */
int bpf_obj_get(const char *pathname)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.pathname = ptr_to_u64((void *)pathname);

	return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr));
}
目前支持的CMD类型
- /* BPF syscall commands, see bpf(2) man-page for details. */
- enum bpf_cmd {
- BPF_MAP_CREATE,
- BPF_MAP_LOOKUP_ELEM,
- BPF_MAP_UPDATE_ELEM,
- BPF_MAP_DELETE_ELEM,
- BPF_MAP_GET_NEXT_KEY,
- BPF_PROG_LOAD,
- BPF_OBJ_PIN,
- BPF_OBJ_GET,
- BPF_PROG_ATTACH,
- BPF_PROG_DETACH,
- BPF_PROG_TEST_RUN,
- BPF_PROG_GET_NEXT_ID,
- BPF_MAP_GET_NEXT_ID,
- BPF_PROG_GET_FD_BY_ID,
- BPF_MAP_GET_FD_BY_ID,
- BPF_OBJ_GET_INFO_BY_FD,
- BPF_PROG_QUERY,
- BPF_RAW_TRACEPOINT_OPEN,
- BPF_BTF_LOAD,
- BPF_BTF_GET_FD_BY_ID,
- BPF_TASK_FD_QUERY,
- };
- kernel/bpf/syscall.c
-
- SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
- {
- union bpf_attr attr;
- int err;
- if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
- return -EPERM;
- err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
- if (err)
- return err;
- size = min_t(u32, size, sizeof(attr));
- /* copy attributes from user space, may be less than sizeof(bpf_attr) */
- memset(&attr, 0, sizeof(attr));
- if (copy_from_user(&attr, uattr, size) != 0)
- return -EFAULT;
- err = security_bpf(cmd, &attr, size);
- if (err < 0)
- return err;
- switch (cmd) {
- case BPF_MAP_CREATE:
- err = map_create(&attr);
- break;
- case BPF_MAP_LOOKUP_ELEM:
- err = map_lookup_elem(&attr);
- break;
- case BPF_MAP_UPDATE_ELEM:
- err = map_update_elem(&attr);
- break;
- case BPF_MAP_DELETE_ELEM:
- err = map_delete_elem(&attr);
- break;
- case BPF_MAP_GET_NEXT_KEY:
- err = map_get_next_key(&attr);
- break;
- case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr);
- break;
- case BPF_OBJ_PIN:
- err = bpf_obj_pin(&attr);
- break;
- case BPF_OBJ_GET:
- err = bpf_obj_get(&attr);
- break;
- case BPF_PROG_ATTACH:
- err = bpf_prog_attach(&attr);
- break;
- case BPF_PROG_DETACH:
- err = bpf_prog_detach(&attr);
- break;
- case BPF_PROG_QUERY:
- err = bpf_prog_query(&attr, uattr);
- break;
- case BPF_PROG_TEST_RUN:
- err = bpf_prog_test_run(&attr, uattr);
- break;
- case BPF_PROG_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
- &prog_idr, &prog_idr_lock);
- break;
- case BPF_MAP_GET_NEXT_ID:
- err = bpf_obj_get_next_id(&attr, uattr,
- &map_idr, &map_idr_lock);
- break;
- case BPF_PROG_GET_FD_BY_ID:
- err = bpf_prog_get_fd_by_id(&attr);
- break;
- case BPF_MAP_GET_FD_BY_ID:
- err = bpf_map_get_fd_by_id(&attr);
- break;
- case BPF_OBJ_GET_INFO_BY_FD:
- err = bpf_obj_get_info_by_fd(&attr, uattr);
- break;
- case BPF_RAW_TRACEPOINT_OPEN:
- err = bpf_raw_tracepoint_open(&attr);
- break;
- case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr);
- break;
- case BPF_BTF_GET_FD_BY_ID:
- err = bpf_btf_get_fd_by_id(&attr);
- break;
- case BPF_TASK_FD_QUERY:
- err = bpf_task_fd_query(&attr, uattr);
- break;
- default:
- err = -EINVAL;
- break;
- }
- return err;
- }
BPF_PROG_LOAD命令负责加载一段BPF程序到内核当中
完成这一切后,后续再把这段BPF程序挂载到需要运行的钩子上面
目前支持的prog类型
- enum bpf_prog_type {
- BPF_PROG_TYPE_UNSPEC,
- BPF_PROG_TYPE_SOCKET_FILTER,
- BPF_PROG_TYPE_KPROBE,
- BPF_PROG_TYPE_SCHED_CLS,
- BPF_PROG_TYPE_SCHED_ACT,
- BPF_PROG_TYPE_TRACEPOINT,
- BPF_PROG_TYPE_XDP,
- BPF_PROG_TYPE_PERF_EVENT,
- BPF_PROG_TYPE_CGROUP_SKB,
- BPF_PROG_TYPE_CGROUP_SOCK,
- BPF_PROG_TYPE_LWT_IN,
- BPF_PROG_TYPE_LWT_OUT,
- BPF_PROG_TYPE_LWT_XMIT,
- BPF_PROG_TYPE_SOCK_OPS,
- BPF_PROG_TYPE_SK_SKB,
- BPF_PROG_TYPE_CGROUP_DEVICE,
- BPF_PROG_TYPE_SK_MSG,
- BPF_PROG_TYPE_RAW_TRACEPOINT,
- BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
- BPF_PROG_TYPE_LWT_SEG6LOCAL,
- BPF_PROG_TYPE_LIRC_MODE2,
- BPF_PROG_TYPE_SK_REUSEPORT,
- };
- static int bpf_prog_load(union bpf_attr *attr)
- {
- enum bpf_prog_type type = attr->prog_type;
- struct bpf_prog *prog;
- int err;
- char license[128];
- bool is_gpl;
- if (CHECK_ATTR(BPF_PROG_LOAD))
- return -EINVAL;
- if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
- return -EINVAL;
- /* copy eBPF program license from user space */
- /* (1.1) 根据attr->license地址,从用户空间拷贝license字符串到内核 */
- if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
- sizeof(license) - 1) < 0)
- return -EFAULT;
- license[sizeof(license) - 1] = 0;
- /* eBPF programs must be GPL compatible to use GPL-ed functions */
- /* (1.2) 判断license是否符合GPL协议 */
- is_gpl = license_is_gpl_compatible(license);
-
- /* (1.3) 判断BPF的总指令数是否超过BPF_MAXINSNS(4k);从5.2版本开始,特权程序的指令数量上限扩大到1M */
- if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
- return -E2BIG;
- if (type == BPF_PROG_TYPE_KPROBE &&
- attr->kern_version != LINUX_VERSION_CODE)
- return -EINVAL;
- if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
- type != BPF_PROG_TYPE_CGROUP_SKB &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
- bpf_prog_load_fixup_attach_type(attr);
- if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type))
- return -EINVAL;
- /* plain bpf_prog allocation */
- prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
- if (!prog)
- return -ENOMEM;
- prog->expected_attach_type = attr->expected_attach_type;
- prog->aux->offload_requested = !!attr->prog_ifindex;
- err = security_bpf_prog_alloc(prog->aux);
- if (err)
- goto free_prog_nouncharge;
- err = bpf_prog_charge_memlock(prog);
- if (err)
- goto free_prog_sec;
- prog->len = attr->insn_cnt;
- err = -EFAULT;
- if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
- bpf_prog_insn_size(prog)) != 0)
- goto free_prog;
- prog->orig_prog = NULL;
- prog->jited = 0;
- atomic_set(&prog->aux->refcnt, 1);
- prog->gpl_compatible = is_gpl ? 1 : 0;
- if (bpf_prog_is_dev_bound(prog->aux)) {
- err = bpf_prog_offload_init(prog, attr);
- if (err)
- goto free_prog;
- }
- /* find program type: socket_filter vs tracing_filter */
- err = find_prog_type(type, prog);
- if (err < 0)
- goto free_prog;
- prog->aux->load_time = ktime_get_boot_ns();
- err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
- if (err)
- goto free_prog;
- /* run eBPF verifier */
- /* (3) 使用verifier对BPF程序进行合法性扫描 */
- err = bpf_check(&prog, attr);
- if (err < 0)
- goto free_used_maps;
- /* (4) 尝试对BPF程序进行JIT转换 */
- prog = bpf_prog_select_runtime(prog, &err);
- if (err < 0)
- goto free_used_maps;
- err = bpf_prog_alloc_id(prog);
- if (err)
- goto free_used_maps;
- /* Upon success of bpf_prog_alloc_id(), the BPF prog is
- * effectively publicly exposed. However, retrieving via
- * bpf_prog_get_fd_by_id() will take another reference,
- * therefore it cannot be gone underneath us.
- *
- * Only for the time /after/ successful bpf_prog_new_fd()
- * and before returning to userspace, we might just hold
- * one reference and any parallel close on that fd could
- * rip everything out. Hence, below notifications must
- * happen before bpf_prog_new_fd().
- *
- * Also, any failure handling from this point onwards must
- * be using bpf_prog_put() given the program is exposed.
- */
- bpf_prog_kallsyms_add(prog);
- /* (5) 给BPF程序分配一个文件句柄fd */
- err = bpf_prog_new_fd(prog);
- if (err < 0)
- bpf_prog_put(prog);
- return err; /* (6) 返回fd句柄 */
- free_used_maps:
- bpf_prog_kallsyms_del_subprogs(prog);
- free_used_maps(prog->aux);
- free_prog:
- bpf_prog_uncharge_memlock(prog);
- free_prog_sec:
- security_bpf_prog_free(prog->aux);
- free_prog_nouncharge:
- bpf_prog_free(prog);
- return err;
- }
- 1.2 申请 bpf_prog 结构
- struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
- {
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
- struct bpf_prog_aux *aux;
- struct bpf_prog *fp;
- size = round_up(size, PAGE_SIZE);
- fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
- if (fp == NULL)
- return NULL;
- aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
- if (aux == NULL) {
- vfree(fp);
- return NULL;
- }
- fp->pages = size / PAGE_SIZE;
- fp->aux = aux;
- fp->aux->prog = fp;
- /* (1.1) 是否打开JIT */
- fp->jit_requested = ebpf_jit_enabled();
- INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
- return fp;
- }
- 1.3 选择运行时
- struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
- {
- /* In case of BPF to BPF calls, verifier did all the prep
- * work with regards to JITing, etc.
- */
- if (fp->bpf_func)
- goto finalize;
- /* (1.1) 注册执行函数:不支持JIT时由解释器执行,默认的执行函数是__bpf_prog_run */
- bpf_prog_select_func(fp);
- /* eBPF JITs can rewrite the program in case constant
- * blinding is active. However, in case of error during
- * blinding, bpf_int_jit_compile() must always return a
- * valid program, which in this case would simply not
- * be JITed, but falls back to the interpreter.
- */
- if (!bpf_prog_is_dev_bound(fp->aux)) {
- /* (1.2) 开始JIT编译,重新注册执行函数入口 */
- fp = bpf_int_jit_compile(fp);
- #ifdef CONFIG_BPF_JIT_ALWAYS_ON
- if (!fp->jited) {
- *err = -ENOTSUPP;
- return fp;
- }
- #endif
- } else {
- *err = bpf_prog_offload_compile(fp);
- if (*err)
- return fp;
- }
- finalize:
- bpf_prog_lock_ro(fp);
- /* The tail call compatibility check can only be done at
- * this late stage as we need to determine, if we deal
- * with JITed or non JITed program concatenations and not
- * all eBPF JITs might immediately support all features.
- */
- *err = bpf_check_tail_call(fp);
- return fp;
- }
-
- 不论是转换成JIT的映像,还是使用interpreter解释器,最后BPF程序运行的时候都是使用BPF_PROG_RUN()这个宏来调用的
- ret = BPF_PROG_RUN(prog, ctx);
- #define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi)
对BPF来说,有个重要的数据结构就是struct bpf_prog
- struct bpf_prog {
- u16 pages; /* Number of allocated pages */
- kmemcheck_bitfield_begin(meta);
- u16 jited:1, /* Is our filter JIT'ed? */
- gpl_compatible:1, /* Is filter GPL compatible? */
- cb_access:1, /* Is control block accessed? */
- dst_needed:1; /* Do we need dst entry? */
- kmemcheck_bitfield_end(meta);
- u32 len; /* Number of filter blocks */
- enum bpf_prog_type type; /* Type of BPF program */
- struct bpf_prog_aux *aux; /* Auxiliary fields */
- struct sock_fprog_kern *orig_prog; /* Original BPF program */
- unsigned int (*bpf_func)(const struct sk_buff *skb,
- const struct bpf_insn *filter);
- /* Instructions for interpreter */
- union {
- struct sock_filter insns[0];
- struct bpf_insn insnsi[0];
- };
- };
其中重要的成员如下:
BPF map的应用场景有几种:
目前支持的BPF MAP类型
- enum bpf_map_type {
- BPF_MAP_TYPE_UNSPEC,
- BPF_MAP_TYPE_HASH,
- BPF_MAP_TYPE_ARRAY,
- BPF_MAP_TYPE_PROG_ARRAY,
- BPF_MAP_TYPE_PERF_EVENT_ARRAY,
- BPF_MAP_TYPE_PERCPU_HASH,
- BPF_MAP_TYPE_PERCPU_ARRAY,
- BPF_MAP_TYPE_STACK_TRACE,
- BPF_MAP_TYPE_CGROUP_ARRAY,
- BPF_MAP_TYPE_LRU_HASH,
- BPF_MAP_TYPE_LRU_PERCPU_HASH,
- BPF_MAP_TYPE_LPM_TRIE,
- BPF_MAP_TYPE_ARRAY_OF_MAPS,
- BPF_MAP_TYPE_HASH_OF_MAPS,
- BPF_MAP_TYPE_DEVMAP,
- BPF_MAP_TYPE_SOCKMAP,
- BPF_MAP_TYPE_CPUMAP,
- BPF_MAP_TYPE_XSKMAP,
- BPF_MAP_TYPE_SOCKHASH,
- BPF_MAP_TYPE_CGROUP_STORAGE,
- BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
- BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
- BPF_MAP_TYPE_QUEUE,
- BPF_MAP_TYPE_STACK,
- BPF_MAP_TYPE_SK_STORAGE,
- BPF_MAP_TYPE_DEVMAP_HASH,
- };
不论哪种map,对map的使用都是以"键-值"对(key-value)的形式进行的
用户态的loader在加载BPF程序的时候,首先会根据__section("maps")中的成员,调用bpf()系统调用来创建map对象。
- /* called via syscall */
- static int map_create(union bpf_attr *attr)
- {
- int numa_node = bpf_map_attr_numa_node(attr);
- struct bpf_map *map;
- int f_flags;
- int err;
- err = CHECK_ATTR(BPF_MAP_CREATE);
- if (err)
- return -EINVAL;
- f_flags = bpf_get_file_flag(attr->map_flags);
- if (f_flags < 0)
- return f_flags;
- if (numa_node != NUMA_NO_NODE &&
- ((unsigned int)numa_node >= nr_node_ids ||
- !node_online(numa_node)))
- return -EINVAL;
- /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
-
- /* (1) 根据map的类型分配空间 */
- map = find_and_alloc_map(attr);
- if (IS_ERR(map))
- return PTR_ERR(map);
- err = bpf_obj_name_cpy(map->name, attr->map_name);
- if (err)
- goto free_map_nouncharge;
- atomic_set(&map->refcnt, 1);
- atomic_set(&map->usercnt, 1);
- if (attr->btf_key_type_id || attr->btf_value_type_id) {
- struct btf *btf;
- if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
- err = -EINVAL;
- goto free_map_nouncharge;
- }
- btf = btf_get_by_fd(attr->btf_fd);
- if (IS_ERR(btf)) {
- err = PTR_ERR(btf);
- goto free_map_nouncharge;
- }
- err = map_check_btf(map, btf, attr->btf_key_type_id,
- attr->btf_value_type_id);
- if (err) {
- btf_put(btf);
- goto free_map_nouncharge;
- }
- map->btf = btf;
- map->btf_key_type_id = attr->btf_key_type_id;
- map->btf_value_type_id = attr->btf_value_type_id;
- }
- err = security_bpf_map_alloc(map);
- if (err)
- goto free_map_nouncharge;
-
- /* (2) 在进程vm中给map锁定空间 */
- err = bpf_map_init_memlock(map);
- if (err)
- goto free_map_sec;
- err = bpf_map_alloc_id(map);
- if (err)
- goto free_map;
- /* (3) 给map分配对应的文件句柄 */
- err = bpf_map_new_fd(map, f_flags);
- if (err < 0) {
- /* failed to allocate fd.
- * bpf_map_put_with_uref() is needed because the above
- * bpf_map_alloc_id() has published the map
- * to the userspace and the userspace may
- * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
- */
- bpf_map_put_with_uref(map);
- return err;
- }
- return err;
- free_map:
- bpf_map_release_memlock(map);
- free_map_sec:
- security_bpf_map_free(map);
- free_map_nouncharge:
- btf_put(map->btf);
- map->ops->map_free(map);
- return err;
- }
以BPF_MAP_TYPE_ARRAY类型的map为例,来看看map的分配过程:
从用户态传过来的attr成员意义如下:
attr->map_type:map的类型;
attr->key_size:键key成员的大小;
attr->value_size:值value成员的大小;
attr->max_entries:需要存储多少个条目("键-值"对)
kernel/bpf/arraymap.c文件中定义了数组类map的操作集合,以BPF_MAP_TYPE_ARRAY类型为例进行说明
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
/*
 * Operation table registered for BPF_MAP_TYPE_ARRAY: every slot points
 * at the corresponding array_map_* implementation.
 */
const struct bpf_map_ops array_map_ops = {
	/* allocation and teardown */
	.map_alloc_check	= array_map_alloc_check,
	.map_alloc		= array_map_alloc,
	.map_free		= array_map_free,

	/* element access */
	.map_lookup_elem	= array_map_lookup_elem,
	.map_update_elem	= array_map_update_elem,
	.map_delete_elem	= array_map_delete_elem,
	.map_get_next_key	= array_map_get_next_key,
	.map_gen_lookup		= array_map_gen_lookup,

	/* BTF check and seq-file output */
	.map_check_btf		= array_map_check_btf,
	.map_seq_show_elem	= array_map_seq_show_elem,
};
查找就是通过key来找到对应的value
- static int map_lookup_elem(union bpf_attr *attr)
- {
- void __user *ukey = u64_to_user_ptr(attr->key);
- void __user *uvalue = u64_to_user_ptr(attr->value);
- int ufd = attr->map_fd;
- struct bpf_map *map;
- void *key, *value, *ptr;
- u32 value_size;
- struct fd f;
- int err;
- if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
- return -EINVAL;
- f = fdget(ufd);
- map = __bpf_map_get(f);
- if (IS_ERR(map))
- return PTR_ERR(map);
- if (!(f.file->f_mode & FMODE_CAN_READ)) {
- err = -EPERM;
- goto err_put;
- }
- key = memdup_user(ukey, map->key_size);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto err_put;
- }
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
- value_size = round_up(map->value_size, 8) * num_possible_cpus();
- else if (IS_FD_MAP(map))
- value_size = sizeof(u32);
- else
- value_size = map->value_size;
- err = -ENOMEM;
- value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
- if (!value)
- goto free_key;
- if (bpf_map_is_dev_bound(map)) {
- err = bpf_map_offload_lookup_elem(map, key, value);
- goto done;
- }
- preempt_disable();
- this_cpu_inc(bpf_prog_active);
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
- err = bpf_stackmap_copy(map, key, value);
- } else if (IS_FD_ARRAY(map)) {
- err = bpf_fd_array_map_lookup_elem(map, key, value);
- } else if (IS_FD_HASH(map)) {
- err = bpf_fd_htab_map_lookup_elem(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
- err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
- } else {
- rcu_read_lock();
- if (map->ops->map_lookup_elem_sys_only)
- ptr = map->ops->map_lookup_elem_sys_only(map, key);
- else
- ptr = map->ops->map_lookup_elem(map, key);
- if (ptr)
- memcpy(value, ptr, value_size);
- rcu_read_unlock();
- err = ptr ? 0 : -ENOENT;
- }
- this_cpu_dec(bpf_prog_active);
- preempt_enable();
- done:
- if (err)
- goto free_value;
- err = -EFAULT;
- if (copy_to_user(uvalue, value, value_size) != 0)
- goto free_value;
- err = 0;
- free_value:
- kfree(value);
- free_key:
- kfree(key);
- err_put:
- fdput(f);
- return err;
- }
针对每种map类型调用其对应的ops注册的查询操作,BPF_MAP_TYPE_ARRAY类型的map最终调用到array_map_lookup_elem():
- /* Called from syscall or from eBPF program */
- static void *array_map_lookup_elem(struct bpf_map *map, void *key)
- {
- struct bpf_array *array = container_of(map, struct bpf_array, map);
- u32 index = *(u32 *)key;
- if (unlikely(index >= array->map.max_entries))
- return NULL;
- return array->value + array->elem_size * (index & array->index_mask);
- }
除了用户态空间需要通过bpf()系统调用来查找key对应的value值,BPF程序中也需要根据key查找到value的地址,然后在BPF程序中使用。BPF程序是通过调用BPF_FUNC_map_lookup_elem helper function来实现的。
- static const struct bpf_func_proto *
- bpf_base_func_proto(enum bpf_func_id func_id)
- {
- switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_get_prandom_u32:
- return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_get_smp_processor_id:
- return &bpf_get_raw_smp_processor_id_proto;
- case BPF_FUNC_get_numa_node_id:
- return &bpf_get_numa_node_id_proto;
- case BPF_FUNC_tail_call:
- return &bpf_tail_call_proto;
- case BPF_FUNC_ktime_get_ns:
- return &bpf_ktime_get_ns_proto;
- case BPF_FUNC_trace_printk:
- if (capable(CAP_SYS_ADMIN))
- return bpf_get_trace_printk_proto();
- /* else: fall through */
- default:
- return NULL;
- }
- }
-
- BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
- {
- WARN_ON_ONCE(!rcu_read_lock_held());
- return (unsigned long) map->ops->map_lookup_elem(map, key);
- }
- const struct bpf_func_proto bpf_map_lookup_elem_proto = {
- .func = bpf_map_lookup_elem,
- .gpl_only = false,
- .pkt_access = true,
- .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
- };
和bpf()系统调用一样,最后调用的都是map->ops->map_lookup_elem()函数,只不过BPF程序需要返回的是value的指针,而bpf()系统调用需要返回的是value的值
BPF map 和程序作为内核资源只能通过文件描述符访问,其背后是内核中的匿名 inode。这带来了很多优点,但同时也有很多缺点:
优点包括:用户空间应用能够使用大部分文件描述符相关的 API,传递给 Unix socket 的文件描述符是透明工作的等等。但也有一系列的坏处:因为fd存在于进程空间中,其他进程不能访问,而且一旦本进程退出,这些对象都会处于失联状态,无法访问。
因此,这给某些特定的场景带来了很多复杂性,例如 iproute2,其中的 tc 或 XDP 在准备环境、加载程序到内核之后最终会退出。在这种情况下,从用户空间也无法访问这些 map 了,而本来这些 map 其实是很有用的,例如,在 data path 的 ingress 和 egress 位置共享的 map(可以统计包数、字节数、PPS 等信息)。另外,第三方应用可能希望在 BPF 程序运行时监控或更新 map。
为了解决这个问题,内核实现了一个最小内核空间 BPF 文件系统,BPF map 和 BPF 程序都可以钉到(pin)这个文件系统内,这个过程称为 object pinning(钉住对象)。相应地,BPF 系统调用进行了扩展,添加了两个新命令,分别用于钉住(BPF_OBJ_PIN)一个对象和获取(BPF_OBJ_GET)一个被钉住的对象(pinned objects)
具体的做法是把这些对象绑定到一个专用的文件系统当中
# ls /sys/fs/bpf/
#
- static int bpf_obj_pin(const union bpf_attr *attr)
- {
- if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
- return -EINVAL;
- return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
- }
- int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
- {
- struct filename *pname;
- enum bpf_type type;
- void *raw;
- int ret;
-
- /* (1) 根据字符串获取路径 */
- pname = getname(pathname);
- if (IS_ERR(pname))
- return PTR_ERR(pname);
-
- /* (2) 根据fd获取到bpf_map/bpf_prog对象 */
- raw = bpf_fd_probe_obj(ufd, &type);
- if (IS_ERR(raw)) {
- ret = PTR_ERR(raw);
- goto out;
- }
-
- /* (3) 创建文件节点,和bpf对象联结起来 */
- ret = bpf_obj_do_pin(pname, raw, type);
- if (ret != 0)
- bpf_any_put(raw, type);
- out:
- putname(pname);
- return ret;
- }
-
- static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
- enum bpf_type type)
- {
- struct dentry *dentry;
- struct inode *dir;
- struct path path;
- umode_t mode;
- int ret;
- dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
- ret = security_path_mknod(&path, dentry, mode, 0);
- if (ret)
- goto out;
- dir = d_inode(path.dentry);
- if (dir->i_op != &bpf_dir_iops) {
- ret = -EPERM;
- goto out;
- }
- switch (type) {
- case BPF_TYPE_PROG:
- ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
- break;
- case BPF_TYPE_MAP:
- ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
- break;
- default:
- ret = -EPERM;
- }
- out:
- done_path_create(&path, dentry);
- return ret;
- }
把map对应的句柄fd与文件关联起来,通过文件就可以访问到具体的fd。
通过文件名称获取本进程中的句柄fd
- static int bpf_obj_get(const union bpf_attr *attr)
- {
- if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
- attr->file_flags & ~BPF_OBJ_FLAG_MASK)
- return -EINVAL;
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
- attr->file_flags);
- }
-
- int bpf_obj_get_user(const char __user *pathname, int flags)
- {
- enum bpf_type type = BPF_TYPE_UNSPEC;
- struct filename *pname;
- int ret = -ENOENT;
- int f_flags;
- void *raw;
- f_flags = bpf_get_file_flag(flags);
- if (f_flags < 0)
- return f_flags;
- /* (1) 根据字符串获取路径 */
- pname = getname(pathname);
- if (IS_ERR(pname))
- return PTR_ERR(pname);
- /* (2) 根据路径,在对应inode中找到bpf对象的map指针和type */
- raw = bpf_obj_do_get(pname, &type, f_flags);
- if (IS_ERR(raw)) {
- ret = PTR_ERR(raw);
- goto out;
- }
-
- /* (3) 根据对象type,在本进程中给bpf对象分配一个fd */
- if (type == BPF_TYPE_PROG)
- ret = bpf_prog_new_fd(raw);
- else if (type == BPF_TYPE_MAP)
- ret = bpf_map_new_fd(raw, f_flags);
- else
- goto out;
- if (ret < 0)
- bpf_any_put(raw, type);
- out:
- putname(pname);
- return ret;
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。