当前位置:   article > 正文

eBPF编程常用API介绍_ebpf 常用api

ebpf 常用api

目录

1 用户态API

1.1 加载prog

1.2 查询prog的信息

1.3 prog绑定到固定的dev上

1.4 创建MAP

1.5 查询MAP

1.6 Object Pinning(钉住对象)

2 内核态API

2.1 内核总体API

2.2 bpf 加载

2.2.1 prog 加载

2.3 bpf map 操作

2.3.1 map 创建

2.3.2 map 查找

2.3.3 BPF_FUNC_map_lookup_elem

2.4 obj pin(钉住对象)

 2.4.1 bpf_obj_do_pin

 2.4.2 bpf_obj_get


1 用户态API

1.1 加载prog

/*
 * Issue BPF_PROG_LOAD through sys_bpf(), retrying while the call fails
 * with EAGAIN.
 *
 * The number of attempts is bounded so that a kernel which keeps answering
 * EAGAIN cannot make the caller spin forever; once the budget is used up the
 * last failure is returned to the caller with errno still set to EAGAIN.
 *
 * @attr: attribute block describing the program to load
 * @size: number of bytes of @attr handed to sys_bpf()
 *
 * Returns the value of the last sys_bpf() call: the new fd when it is >= 0,
 * otherwise a negative value with errno as left by that call.
 */
static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size)
{
	enum { PROG_LOAD_ATTEMPTS = 5 };
	int attempts = PROG_LOAD_ATTEMPTS;
	int fd;

	do {
		fd = sys_bpf(BPF_PROG_LOAD, attr, size);
	} while (fd < 0 && errno == EAGAIN && --attempts > 0);

	return fd;
}

1.2 查询prog的信息

/*
 * Query information about a BPF object via BPF_OBJ_GET_INFO_BY_FD.
 *
 * @prog_fd:  descriptor stored in attr.info.bpf_fd
 * @info:     caller buffer whose address is passed to the kernel
 * @info_len: in:  value copied into attr.info.info_len
 *            out: overwritten with attr.info.info_len when sys_bpf()
 *                 returns 0; left untouched otherwise
 *
 * Returns the sys_bpf() result unchanged.
 */
int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len)
{
	union bpf_attr req;
	int rc;

	/* Clear the whole union before filling in the info sub-struct. */
	memset(&req, 0, sizeof(req));
	req.info.bpf_fd = prog_fd;
	req.info.info_len = *info_len;
	req.info.info = ptr_to_u64(info);

	rc = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &req, sizeof(req));
	if (rc != 0)
		return rc;

	*info_len = req.info.info_len;
	return 0;
}

1.3 prog绑定到固定的dev上

使用的是netlink消息跟内核通信,把fd和dev信息以及flag发送到内核,netlink使用的是NETLINK_ROUTE

/*
 * Send an RTM_SETLINK netlink request that carries an XDP program fd for
 * interface @ifindex.
 *
 * The request holds a nested IFLA_XDP attribute containing IFLA_XDP_FD and,
 * when @flags is non-zero, IFLA_XDP_FLAGS. It is sent on the socket returned
 * by libbpf_netlink_open() and the reply is read with bpf_netlink_recv().
 *
 * @ifindex: interface index written to ifinfo.ifi_index
 * @fd:      value placed in the IFLA_XDP_FD attribute
 * @flags:   value placed in the IFLA_XDP_FLAGS attribute (omitted when 0)
 *
 * Returns the libbpf_netlink_open() result if it is negative, -errno if
 * send() fails, otherwise the bpf_netlink_recv() result. The socket is
 * closed before returning on every path that opened it.
 */
int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
{
	struct {
		struct nlmsghdr nh;
		struct ifinfomsg ifinfo;
		char attrbuf[64];
	} req;
	struct nlattr *nest, *sub;
	__u32 nl_pid;
	int seq = 1;
	int sock, ret;

	sock = libbpf_netlink_open(&nl_pid);
	if (sock < 0)
		return sock;

	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nh.nlmsg_type = RTM_SETLINK;
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.nh.nlmsg_seq = seq;
	req.nh.nlmsg_pid = 0;
	req.ifinfo.ifi_family = AF_UNSPEC;
	req.ifinfo.ifi_index = ifindex;

	/* Open the nested IFLA_XDP attribute right after the aligned header. */
	nest = (struct nlattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	nest->nla_type = NLA_F_NESTED | IFLA_XDP;
	nest->nla_len = NLA_HDRLEN;

	/* First nested attribute: the fd. */
	sub = (struct nlattr *)((char *)nest + nest->nla_len);
	sub->nla_type = IFLA_XDP_FD;
	sub->nla_len = NLA_HDRLEN + sizeof(int);
	memcpy((char *)sub + NLA_HDRLEN, &fd, sizeof(fd));
	nest->nla_len += sub->nla_len;

	/* Second nested attribute, only when the caller supplied flags. */
	if (flags != 0) {
		sub = (struct nlattr *)((char *)nest + nest->nla_len);
		sub->nla_type = IFLA_XDP_FLAGS;
		sub->nla_len = NLA_HDRLEN + sizeof(flags);
		memcpy((char *)sub + NLA_HDRLEN, &flags, sizeof(flags));
		nest->nla_len += sub->nla_len;
	}

	/* Account for the finished nest in the total message length. */
	req.nh.nlmsg_len += NLA_ALIGN(nest->nla_len);

	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0)
		ret = -errno;
	else
		ret = bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL);

	close(sock);
	return ret;
}

1.4 创建MAP

int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
{
union bpf_attr attr;

memset(&attr, '\0', sizeof(attr));

attr.map_type = create_attr->map_type;
attr.key_size = create_attr->key_size;
attr.value_size = create_attr->value_size;
attr.max_entries = create_attr->max_entries;
attr.map_flags = create_attr->map_flags;
if (create_attr->name)
memcpy(attr.map_name, create_attr->name,
min(strlen(create_attr->name), BPF_OBJ_NAME_LEN - 1));
attr.numa_node = create_attr->numa_node;
attr.btf_fd = create_attr->btf_fd;
attr.btf_key_type_id = create_attr->btf_key_type_id;
attr.btf_value_type_id = create_attr->btf_value_type_id;
attr.map_ifindex = create_attr->map_ifindex;
attr.inner_map_fd = create_attr->inner_map_fd;

return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}

1.5 查询MAP


int bpf_map_lookup_elem(int fd, const void *key, void *value)
{
union bpf_attr attr;

memset(&attr, 0, sizeof(attr));
attr.map_fd = fd;
attr.key = ptr_to_u64(key);
attr.value = ptr_to_u64(value);

return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}

1.6 Object Pinning(钉住对象)

int bpf_obj_pin(int fd, const char *pathname)
{
union bpf_attr attr;

memset(&attr, 0, sizeof(attr));
attr.pathname = ptr_to_u64((void *)pathname);
attr.bpf_fd = fd;

return sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr));
}

int bpf_obj_get(const char *pathname)
{
union bpf_attr attr;

memset(&attr, 0, sizeof(attr));
attr.pathname = ptr_to_u64((void *)pathname);

return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr));
}

目前支持的CMD类型

  1. /* BPF syscall commands, see bpf(2) man-page for details. */
  2. enum bpf_cmd {
  3. BPF_MAP_CREATE,
  4. BPF_MAP_LOOKUP_ELEM,
  5. BPF_MAP_UPDATE_ELEM,
  6. BPF_MAP_DELETE_ELEM,
  7. BPF_MAP_GET_NEXT_KEY,
  8. BPF_PROG_LOAD,
  9. BPF_OBJ_PIN,
  10. BPF_OBJ_GET,
  11. BPF_PROG_ATTACH,
  12. BPF_PROG_DETACH,
  13. BPF_PROG_TEST_RUN,
  14. BPF_PROG_GET_NEXT_ID,
  15. BPF_MAP_GET_NEXT_ID,
  16. BPF_PROG_GET_FD_BY_ID,
  17. BPF_MAP_GET_FD_BY_ID,
  18. BPF_OBJ_GET_INFO_BY_FD,
  19. BPF_PROG_QUERY,
  20. BPF_RAW_TRACEPOINT_OPEN,
  21. BPF_BTF_LOAD,
  22. BPF_BTF_GET_FD_BY_ID,
  23. BPF_TASK_FD_QUERY,
  24. };

2 内核态API

2.1 内核总体API

  1. kernel/bpf/syscall.c
  2. SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
  3. {
  4. union bpf_attr attr;
  5. int err;
  6. if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
  7. return -EPERM;
  8. err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
  9. if (err)
  10. return err;
  11. size = min_t(u32, size, sizeof(attr));
  12. /* copy attributes from user space, may be less than sizeof(bpf_attr) */
  13. memset(&attr, 0, sizeof(attr));
  14. if (copy_from_user(&attr, uattr, size) != 0)
  15. return -EFAULT;
  16. err = security_bpf(cmd, &attr, size);
  17. if (err < 0)
  18. return err;
  19. switch (cmd) {
  20. case BPF_MAP_CREATE:
  21. err = map_create(&attr);
  22. break;
  23. case BPF_MAP_LOOKUP_ELEM:
  24. err = map_lookup_elem(&attr);
  25. break;
  26. case BPF_MAP_UPDATE_ELEM:
  27. err = map_update_elem(&attr);
  28. break;
  29. case BPF_MAP_DELETE_ELEM:
  30. err = map_delete_elem(&attr);
  31. break;
  32. case BPF_MAP_GET_NEXT_KEY:
  33. err = map_get_next_key(&attr);
  34. break;
  35. case BPF_PROG_LOAD:
  36. err = bpf_prog_load(&attr);
  37. break;
  38. case BPF_OBJ_PIN:
  39. err = bpf_obj_pin(&attr);
  40. break;
  41. case BPF_OBJ_GET:
  42. err = bpf_obj_get(&attr);
  43. break;
  44. case BPF_PROG_ATTACH:
  45. err = bpf_prog_attach(&attr);
  46. break;
  47. case BPF_PROG_DETACH:
  48. err = bpf_prog_detach(&attr);
  49. break;
  50. case BPF_PROG_QUERY:
  51. err = bpf_prog_query(&attr, uattr);
  52. break;
  53. case BPF_PROG_TEST_RUN:
  54. err = bpf_prog_test_run(&attr, uattr);
  55. break;
  56. case BPF_PROG_GET_NEXT_ID:
  57. err = bpf_obj_get_next_id(&attr, uattr,
  58. &prog_idr, &prog_idr_lock);
  59. break;
  60. case BPF_MAP_GET_NEXT_ID:
  61. err = bpf_obj_get_next_id(&attr, uattr,
  62. &map_idr, &map_idr_lock);
  63. break;
  64. case BPF_PROG_GET_FD_BY_ID:
  65. err = bpf_prog_get_fd_by_id(&attr);
  66. break;
  67. case BPF_MAP_GET_FD_BY_ID:
  68. err = bpf_map_get_fd_by_id(&attr);
  69. break;
  70. case BPF_OBJ_GET_INFO_BY_FD:
  71. err = bpf_obj_get_info_by_fd(&attr, uattr);
  72. break;
  73. case BPF_RAW_TRACEPOINT_OPEN:
  74. err = bpf_raw_tracepoint_open(&attr);
  75. break;
  76. case BPF_BTF_LOAD:
  77. err = bpf_btf_load(&attr);
  78. break;
  79. case BPF_BTF_GET_FD_BY_ID:
  80. err = bpf_btf_get_fd_by_id(&attr);
  81. break;
  82. case BPF_TASK_FD_QUERY:
  83. err = bpf_task_fd_query(&attr, uattr);
  84. break;
  85. default:
  86. err = -EINVAL;
  87. break;
  88. }
  89. return err;
  90. }

2.2 bpf 加载

BPF_PROG_LOAD命令负责加载一段BPF程序到内核当中

  • 拷贝程序到内核;
  • 校验它的安全性;
  • 如果可能对它进行JIT编译;
  • 然后分配一个文件句柄fd给它。

完成这一切后,后续再把这段BPF程序挂载到需要运行的钩子上面

目前支持的prog类型

  1. enum bpf_prog_type {
  2. BPF_PROG_TYPE_UNSPEC,
  3. BPF_PROG_TYPE_SOCKET_FILTER,
  4. BPF_PROG_TYPE_KPROBE,
  5. BPF_PROG_TYPE_SCHED_CLS,
  6. BPF_PROG_TYPE_SCHED_ACT,
  7. BPF_PROG_TYPE_TRACEPOINT,
  8. BPF_PROG_TYPE_XDP,
  9. BPF_PROG_TYPE_PERF_EVENT,
  10. BPF_PROG_TYPE_CGROUP_SKB,
  11. BPF_PROG_TYPE_CGROUP_SOCK,
  12. BPF_PROG_TYPE_LWT_IN,
  13. BPF_PROG_TYPE_LWT_OUT,
  14. BPF_PROG_TYPE_LWT_XMIT,
  15. BPF_PROG_TYPE_SOCK_OPS,
  16. BPF_PROG_TYPE_SK_SKB,
  17. BPF_PROG_TYPE_CGROUP_DEVICE,
  18. BPF_PROG_TYPE_SK_MSG,
  19. BPF_PROG_TYPE_RAW_TRACEPOINT,
  20. BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
  21. BPF_PROG_TYPE_LWT_SEG6LOCAL,
  22. BPF_PROG_TYPE_LIRC_MODE2,
  23. BPF_PROG_TYPE_SK_REUSEPORT,
  24. };

2.2.1 prog 加载

  1. static int bpf_prog_load(union bpf_attr *attr)
  2. {
  3. enum bpf_prog_type type = attr->prog_type;
  4. struct bpf_prog *prog;
  5. int err;
  6. char license[128];
  7. bool is_gpl;
  8. if (CHECK_ATTR(BPF_PROG_LOAD))
  9. return -EINVAL;
  10. if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
  11. return -EINVAL;
  12. /* copy eBPF program license from user space */
  13. /* (1.1) 根据attr->license地址,从用户空间拷贝license字符串到内核 */
  14. if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
  15. sizeof(license) - 1) < 0)
  16. return -EFAULT;
  17. license[sizeof(license) - 1] = 0;
  18. /* eBPF programs must be GPL compatible to use GPL-ed functions */
  19. /* (1.2) 判断license是否符合GPL协议 */
  20. is_gpl = license_is_gpl_compatible(license);
  21. /* (1.3) 判断BPF的总指令数是否超过BPF_MAXINSNS(4k);自5.2版本起,特权程序的指令数上限扩大到1M */
  22. if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
  23. return -E2BIG;
  24. if (type == BPF_PROG_TYPE_KPROBE &&
  25. attr->kern_version != LINUX_VERSION_CODE)
  26. return -EINVAL;
  27. if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
  28. type != BPF_PROG_TYPE_CGROUP_SKB &&
  29. !capable(CAP_SYS_ADMIN))
  30. return -EPERM;
  31. bpf_prog_load_fixup_attach_type(attr);
  32. if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type))
  33. return -EINVAL;
  34. /* plain bpf_prog allocation */
  35. prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
  36. if (!prog)
  37. return -ENOMEM;
  38. prog->expected_attach_type = attr->expected_attach_type;
  39. prog->aux->offload_requested = !!attr->prog_ifindex;
  40. err = security_bpf_prog_alloc(prog->aux);
  41. if (err)
  42. goto free_prog_nouncharge;
  43. err = bpf_prog_charge_memlock(prog);
  44. if (err)
  45. goto free_prog_sec;
  46. prog->len = attr->insn_cnt;
  47. err = -EFAULT;
  48. if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
  49. bpf_prog_insn_size(prog)) != 0)
  50. goto free_prog;
  51. prog->orig_prog = NULL;
  52. prog->jited = 0;
  53. atomic_set(&prog->aux->refcnt, 1);
  54. prog->gpl_compatible = is_gpl ? 1 : 0;
  55. if (bpf_prog_is_dev_bound(prog->aux)) {
  56. err = bpf_prog_offload_init(prog, attr);
  57. if (err)
  58. goto free_prog;
  59. }
  60. /* find program type: socket_filter vs tracing_filter */
  61. err = find_prog_type(type, prog);
  62. if (err < 0)
  63. goto free_prog;
  64. prog->aux->load_time = ktime_get_boot_ns();
  65. err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
  66. if (err)
  67. goto free_prog;
  68. /* run eBPF verifier */
  69. /* (3) 使用verifier对BPF程序进行合法性扫描 */
  70. err = bpf_check(&prog, attr);
  71. if (err < 0)
  72. goto free_used_maps;
  73. /* (4) 尝试对BPF程序进行JIT转换 */
  74. prog = bpf_prog_select_runtime(prog, &err);
  75. if (err < 0)
  76. goto free_used_maps;
  77. err = bpf_prog_alloc_id(prog);
  78. if (err)
  79. goto free_used_maps;
  80. /* Upon success of bpf_prog_alloc_id(), the BPF prog is
  81. * effectively publicly exposed. However, retrieving via
  82. * bpf_prog_get_fd_by_id() will take another reference,
  83. * therefore it cannot be gone underneath us.
  84. *
  85. * Only for the time /after/ successful bpf_prog_new_fd()
  86. * and before returning to userspace, we might just hold
  87. * one reference and any parallel close on that fd could
  88. * rip everything out. Hence, below notifications must
  89. * happen before bpf_prog_new_fd().
  90. *
  91. * Also, any failure handling from this point onwards must
  92. * be using bpf_prog_put() given the program is exposed.
  93. */
  94. bpf_prog_kallsyms_add(prog);
  95. /* (5) 给BPF程序分配一个文件句柄fd */
  96. err = bpf_prog_new_fd(prog);
  97. if (err < 0)
  98. bpf_prog_put(prog);
  99. return err; /* (6) 返回fd句柄 */
  100. free_used_maps:
  101. bpf_prog_kallsyms_del_subprogs(prog);
  102. free_used_maps(prog->aux);
  103. free_prog:
  104. bpf_prog_uncharge_memlock(prog);
  105. free_prog_sec:
  106. security_bpf_prog_free(prog->aux);
  107. free_prog_nouncharge:
  108. bpf_prog_free(prog);
  109. return err;
  110. }
  1. 1.2 申请 bpf_prog 结构
  2. struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
  3. {
  4. gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
  5. struct bpf_prog_aux *aux;
  6. struct bpf_prog *fp;
  7. size = round_up(size, PAGE_SIZE);
  8. fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
  9. if (fp == NULL)
  10. return NULL;
  11. aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
  12. if (aux == NULL) {
  13. vfree(fp);
  14. return NULL;
  15. }
  16. fp->pages = size / PAGE_SIZE;
  17. fp->aux = aux;
  18. fp->aux->prog = fp;
  19. /* (1.1) 是否打开JIT */
  20. fp->jit_requested = ebpf_jit_enabled();
  21. INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
  22. return fp;
  23. }
  1. 1.3 选择运行时
  2. struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
  3. {
  4. /* In case of BPF to BPF calls, verifier did all the prep
  5. * work with regards to JITing, etc.
  6. */
  7. if (fp->bpf_func)
  8. goto finalize;
  9. /* (1.1) 注册执行函数,在不支持JIT时执行函数,默认是__bpf_prog_run */
  10. bpf_prog_select_func(fp);
  11. /* eBPF JITs can rewrite the program in case constant
  12. * blinding is active. However, in case of error during
  13. * blinding, bpf_int_jit_compile() must always return a
  14. * valid program, which in this case would simply not
  15. * be JITed, but falls back to the interpreter.
  16. */
  17. if (!bpf_prog_is_dev_bound(fp->aux)) {
  18. /* (1.2) 开始JIT编译,重新注册执行函数入口 */
  19. fp = bpf_int_jit_compile(fp);
  20. #ifdef CONFIG_BPF_JIT_ALWAYS_ON
  21. if (!fp->jited) {
  22. *err = -ENOTSUPP;
  23. return fp;
  24. }
  25. #endif
  26. } else {
  27. *err = bpf_prog_offload_compile(fp);
  28. if (*err)
  29. return fp;
  30. }
  31. finalize:
  32. bpf_prog_lock_ro(fp);
  33. /* The tail call compatibility check can only be done at
  34. * this late stage as we need to determine, if we deal
  35. * with JITed or non JITed program concatenations and not
  36. * all eBPF JITs might immediately support all features.
  37. */
  38. *err = bpf_check_tail_call(fp);
  39. return fp;
  40. }
  41. 不论是转换成JIT的映像,或者是使用interpreter解释器。最后BPF程序运行的时候都是使用BPF_PROG_RUN()这个宏来调用的
  42. ret = BPF_PROG_RUN(prog, ctx);
  43. #define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi)

对BPF来说,有个重要的数据结构就是struct bpf_prog

  1. struct bpf_prog {
  2. u16 pages; /* Number of allocated pages */
  3. kmemcheck_bitfield_begin(meta);
  4. u16 jited:1, /* Is our filter JIT'ed? */
  5. gpl_compatible:1, /* Is filter GPL compatible? */
  6. cb_access:1, /* Is control block accessed? */
  7. dst_needed:1; /* Do we need dst entry? */
  8. kmemcheck_bitfield_end(meta);
  9. u32 len; /* Number of filter blocks */
  10. enum bpf_prog_type type; /* Type of BPF program */
  11. struct bpf_prog_aux *aux; /* Auxiliary fields */
  12. struct sock_fprog_kern *orig_prog; /* Original BPF program */
  13. unsigned int (*bpf_func)(const struct sk_buff *skb,
  14. const struct bpf_insn *filter);
  15. /* Instructions for interpreter */
  16. union {
  17. struct sock_filter insns[0];
  18. struct bpf_insn insnsi[0];
  19. };
  20. };

其中重要的成员如下:

  • len:程序包含bpf指令的数量;
  • type:当前bpf程序的类型
  • aux:主要用来辅助verifier校验和转换的数据;
  • orig_prog:原始的BPF程序(对应结构体注释中的"Original BPF program");
  • bpf_func:运行时BPF程序的入口。如果JIT转换成功,这里指向的就是BPF程序JIT转换后的映像;否则这里指向内核解释器(interpreter)的通用入口__bpf_prog_run();
  • insnsi[]:从用户态拷贝过来的,BPF程序原始指令的存放空间;

2.3 bpf map 操作

BPF map的应用场景有几种:

  • BPF程序和用户态的交互:BPF程序运行完,得到的结果存储到map中,供用户态访问;
  • BPF程序内部交互:如果BPF程序内部需要用全局变量来交互,但是由于安全原因BPF程序不允许访问全局变量,可以使用map来充当全局变量;
  • BPF Tail call:Tail call是一个BPF程序跳转到另一BPF程序,BPF程序首先通过BPF_MAP_TYPE_PROG_ARRAY类型的map来知道另一个BPF程序的指针,然后调用tail_call()的helper function来执行Tail call。
  • BPF程序和内核态的交互:和BPF程序以外的内核程序交互,也可以使用map作为中介;

目前支持的BPF MAP类型

  1. enum bpf_map_type {
  2. BPF_MAP_TYPE_UNSPEC,
  3. BPF_MAP_TYPE_HASH,
  4. BPF_MAP_TYPE_ARRAY,
  5. BPF_MAP_TYPE_PROG_ARRAY,
  6. BPF_MAP_TYPE_PERF_EVENT_ARRAY,
  7. BPF_MAP_TYPE_PERCPU_HASH,
  8. BPF_MAP_TYPE_PERCPU_ARRAY,
  9. BPF_MAP_TYPE_STACK_TRACE,
  10. BPF_MAP_TYPE_CGROUP_ARRAY,
  11. BPF_MAP_TYPE_LRU_HASH,
  12. BPF_MAP_TYPE_LRU_PERCPU_HASH,
  13. BPF_MAP_TYPE_LPM_TRIE,
  14. BPF_MAP_TYPE_ARRAY_OF_MAPS,
  15. BPF_MAP_TYPE_HASH_OF_MAPS,
  16. BPF_MAP_TYPE_DEVMAP,
  17. BPF_MAP_TYPE_SOCKMAP,
  18. BPF_MAP_TYPE_CPUMAP,
  19. BPF_MAP_TYPE_XSKMAP,
  20. BPF_MAP_TYPE_SOCKHASH,
  21. BPF_MAP_TYPE_CGROUP_STORAGE,
  22. BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
  23. BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
  24. BPF_MAP_TYPE_QUEUE,
  25. BPF_MAP_TYPE_STACK,
  26. BPF_MAP_TYPE_SK_STORAGE,
  27. BPF_MAP_TYPE_DEVMAP_HASH,
  28. };

不论哪种map,对map的使用都是用"键-值"对(key-value)的形式来使用的

2.3.1 map 创建

用户态的loader在加载BPF程序的时候,首先会根据__section("maps")中的成员来调用bpf()系统调用来创建map对象。

  1. /* called via syscall */
  2. static int map_create(union bpf_attr *attr)
  3. {
  4. int numa_node = bpf_map_attr_numa_node(attr);
  5. struct bpf_map *map;
  6. int f_flags;
  7. int err;
  8. err = CHECK_ATTR(BPF_MAP_CREATE);
  9. if (err)
  10. return -EINVAL;
  11. f_flags = bpf_get_file_flag(attr->map_flags);
  12. if (f_flags < 0)
  13. return f_flags;
  14. if (numa_node != NUMA_NO_NODE &&
  15. ((unsigned int)numa_node >= nr_node_ids ||
  16. !node_online(numa_node)))
  17. return -EINVAL;
  18. /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
  19. /* (1) 根据map的类型分配空间 */
  20. map = find_and_alloc_map(attr);
  21. if (IS_ERR(map))
  22. return PTR_ERR(map);
  23. err = bpf_obj_name_cpy(map->name, attr->map_name);
  24. if (err)
  25. goto free_map_nouncharge;
  26. atomic_set(&map->refcnt, 1);
  27. atomic_set(&map->usercnt, 1);
  28. if (attr->btf_key_type_id || attr->btf_value_type_id) {
  29. struct btf *btf;
  30. if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
  31. err = -EINVAL;
  32. goto free_map_nouncharge;
  33. }
  34. btf = btf_get_by_fd(attr->btf_fd);
  35. if (IS_ERR(btf)) {
  36. err = PTR_ERR(btf);
  37. goto free_map_nouncharge;
  38. }
  39. err = map_check_btf(map, btf, attr->btf_key_type_id,
  40. attr->btf_value_type_id);
  41. if (err) {
  42. btf_put(btf);
  43. goto free_map_nouncharge;
  44. }
  45. map->btf = btf;
  46. map->btf_key_type_id = attr->btf_key_type_id;
  47. map->btf_value_type_id = attr->btf_value_type_id;
  48. }
  49. err = security_bpf_map_alloc(map);
  50. if (err)
  51. goto free_map_nouncharge;
  52. /* (2) 在进程vm中给map锁定空间 */
  53. err = bpf_map_init_memlock(map);
  54. if (err)
  55. goto free_map_sec;
  56. err = bpf_map_alloc_id(map);
  57. if (err)
  58. goto free_map;
  59. /* (3) 给map分配对应的文件句柄 */
  60. err = bpf_map_new_fd(map, f_flags);
  61. if (err < 0) {
  62. /* failed to allocate fd.
  63. * bpf_map_put_with_uref() is needed because the above
  64. * bpf_map_alloc_id() has published the map
  65. * to the userspace and the userspace may
  66. * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
  67. */
  68. bpf_map_put_with_uref(map);
  69. return err;
  70. }
  71. return err;
  72. free_map:
  73. bpf_map_release_memlock(map);
  74. free_map_sec:
  75. security_bpf_map_free(map);
  76. free_map_nouncharge:
  77. btf_put(map->btf);
  78. map->ops->map_free(map);
  79. return err;
  80. }

以BPF_MAP_TYPE_ARRAY类型的map为例,来看看map的分配过程:

从用户态传过来的attr成员意义如下:
attr->map_type:map的类型;
attr->key_size:键key成员的大小;
attr->value_size:值value成员的大小;
attr->max_entries:需要存储多少个条目("键-值"对)

array_map.c文件中定义了各种类型的map操作集合,以BPF_MAP_TYPE_ARRAY类型为例进行说明

BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)

/*
 * Operations table for array maps: each callback slot of struct bpf_map_ops
 * is bound to the matching array_map_* function.
 */
const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_gen_lookup = array_map_gen_lookup,
.map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
};

2.3.2 map 查找

查找就是通过key来找到对应的value

  1. static int map_lookup_elem(union bpf_attr *attr)
  2. {
  3.     void __user *ukey = u64_to_user_ptr(attr->key);
  4.     void __user *uvalue = u64_to_user_ptr(attr->value);
  5.     int ufd = attr->map_fd;
  6.     struct bpf_map *map;
  7.     void *key, *value, *ptr;
  8.     u32 value_size;
  9.     struct fd f;
  10.     int err;
  11.     if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
  12.         return -EINVAL;
  13.     f = fdget(ufd);
  14.     map = __bpf_map_get(f);
  15.     if (IS_ERR(map))
  16.         return PTR_ERR(map);
  17.     if (!(f.file->f_mode & FMODE_CAN_READ)) {
  18.         err = -EPERM;
  19.         goto err_put;
  20.     }
  21.     key = memdup_user(ukey, map->key_size);
  22.     if (IS_ERR(key)) {
  23.         err = PTR_ERR(key);
  24.         goto err_put;
  25.     }
  26.     if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
  27.         map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
  28.         map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
  29.         value_size = round_up(map->value_size, 8) * num_possible_cpus();
  30.     else if (IS_FD_MAP(map))
  31.         value_size = sizeof(u32);
  32.     else
  33.         value_size = map->value_size;
  34.     err = -ENOMEM;
  35.     value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
  36.     if (!value)
  37.         goto free_key;
  38.     if (bpf_map_is_dev_bound(map)) {
  39.         err = bpf_map_offload_lookup_elem(map, key, value);
  40.         goto done;
  41.     }
  42.     preempt_disable();
  43.     this_cpu_inc(bpf_prog_active);
  44.     if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
  45.         map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
  46.         err = bpf_percpu_hash_copy(map, key, value);
  47.     } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
  48.         err = bpf_percpu_array_copy(map, key, value);
  49.     } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
  50.         err = bpf_stackmap_copy(map, key, value);
  51.     } else if (IS_FD_ARRAY(map)) {
  52.         err = bpf_fd_array_map_lookup_elem(map, key, value);
  53.     } else if (IS_FD_HASH(map)) {
  54.         err = bpf_fd_htab_map_lookup_elem(map, key, value);
  55.     } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
  56.         err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
  57.     } else {
  58.         rcu_read_lock();
  59.         if (map->ops->map_lookup_elem_sys_only)
  60.             ptr = map->ops->map_lookup_elem_sys_only(map, key);
  61.         else
  62.             ptr = map->ops->map_lookup_elem(map, key);
  63.         if (ptr)
  64.             memcpy(value, ptr, value_size);
  65.         rcu_read_unlock();
  66.         err = ptr ? 0 : -ENOENT;
  67.     }
  68.     this_cpu_dec(bpf_prog_active);
  69.     preempt_enable();
  70. done:
  71.     if (err)
  72.         goto free_value;
  73.     err = -EFAULT;
  74.     if (copy_to_user(uvalue, value, value_size) != 0)
  75.         goto free_value;
  76.     err = 0;
  77. free_value:
  78.     kfree(value);
  79. free_key:
  80.     kfree(key);
  81. err_put:
  82.     fdput(f);
  83.     return err;
  84. }

针对每种map类型调用其对应的ops注册的查询操作,BPF_MAP_TYPE_ARRAY类型的map最终调用到array_map_lookup_elem():

  1. /* Called from syscall or from eBPF program */
  2. static void *array_map_lookup_elem(struct bpf_map *map, void *key)
  3. {
  4.     struct bpf_array *array = container_of(map, struct bpf_array, map);
  5.     u32 index = *(u32 *)key;
  6.     if (unlikely(index >= array->map.max_entries))
  7.         return NULL;
  8.     return array->value + array->elem_size * (index & array->index_mask);
  9. }

2.3.3 BPF_FUNC_map_lookup_elem

除了用户态空间需要通过bpf()系统调用来查找key对应的value值,BPF程序中也需要根据key查找到value的地址,然后在BPF程序中使用。BPF程序是通过调用BPF_FUNC_map_lookup_elem helper function来实现的。

  1. static const struct bpf_func_proto *
  2. bpf_base_func_proto(enum bpf_func_id func_id)
  3. {
  4.     switch (func_id) {
  5.     case BPF_FUNC_map_lookup_elem:
  6.         return &bpf_map_lookup_elem_proto;
  7.     case BPF_FUNC_map_update_elem:
  8.         return &bpf_map_update_elem_proto;
  9.     case BPF_FUNC_map_delete_elem:
  10.         return &bpf_map_delete_elem_proto;
  11.     case BPF_FUNC_get_prandom_u32:
  12.         return &bpf_get_prandom_u32_proto;
  13.     case BPF_FUNC_get_smp_processor_id:
  14.         return &bpf_get_raw_smp_processor_id_proto;
  15.     case BPF_FUNC_get_numa_node_id:
  16.         return &bpf_get_numa_node_id_proto;
  17.     case BPF_FUNC_tail_call:
  18.         return &bpf_tail_call_proto;
  19.     case BPF_FUNC_ktime_get_ns:
  20.         return &bpf_ktime_get_ns_proto;
  21.     case BPF_FUNC_trace_printk:
  22.         if (capable(CAP_SYS_ADMIN))
  23.             return bpf_get_trace_printk_proto();
  24.         /* else: fall through */
  25.     default:
  26.         return NULL;
  27.     }
  28. }
  29.  
  30. BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
  31. {
  32.     WARN_ON_ONCE(!rcu_read_lock_held());
  33.     return (unsigned long) map->ops->map_lookup_elem(map, key);
  34. }
  35. const struct bpf_func_proto bpf_map_lookup_elem_proto = {
  36.     .func       = bpf_map_lookup_elem,
  37.     .gpl_only   = false,
  38.     .pkt_access = true,
  39.     .ret_type   = RET_PTR_TO_MAP_VALUE_OR_NULL,
  40.     .arg1_type  = ARG_CONST_MAP_PTR,
  41.     .arg2_type  = ARG_PTR_TO_MAP_KEY,
  42. };

和bpf()系统调用一样,最后调用的都是map->ops->map_lookup_elem()函数,只不过BPF程序需要返回的是value的指针,而bpf()系统调用需要返回的是value的值

2.4 obj pin(钉住对象)

BPF map 和程序作为内核资源只能通过文件描述符访问,其背后是内核中的匿名 inode。这带来了很多优点,但同时也有很多缺点:

优点包括:用户空间应用能够使用大部分文件描述符相关的 API,传递给 Unix socket 的文件描述符是透明工作的等等。也有一系列的坏处:因为 fd 生存在进程空间内,其他进程不能访问,而且一旦本进程退出,这些对象都会处于失联状态无法访问。

因此,这给某些特定的场景带来了很多复杂性,例如 iproute2,其中的 tc 或 XDP 在准备环境、加载程序到内核之后最终会退出。在这种情况下,从用户空间也无法访问这些 map 了,而本来这些 map 其实是很有用的,例如,在 data path 的 ingress 和 egress 位置共享的 map(可以统计包数、字节数、PPS 等信息)。另外,第三方应用可能希望在 BPF 程序运行时监控或更新 map。

为了解决这个问题,内核实现了一个最小内核空间 BPF 文件系统,BPF map 和 BPF 程序都可以钉到(pin)这个文件系统内,这个过程称为 object pinning(钉住对象)。相应地,BPF 系统调用进行了扩展,添加了两个新命令,分别用于钉住(BPF_OBJ_PIN)一个对象和获取(BPF_OBJ_GET)一个被钉住的对象(pinned objects)。

具体的做法是把这些对象绑定到一个专用的文件系统当中

# ls /sys/fs/bpf/

#

 2.4.1 bpf_obj_do_pin

  1. static int bpf_obj_pin(const union bpf_attr *attr)
  2. {
  3. if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
  4. return -EINVAL;
  5. return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
  6. }
  7. int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
  8. {
  9. struct filename *pname;
  10. enum bpf_type type;
  11. void *raw;
  12. int ret;
  13. /* (1) 根据字符串获取路径 */
  14. pname = getname(pathname);
  15. if (IS_ERR(pname))
  16. return PTR_ERR(pname);
  17. /* (2) 根据fd获取到bpf_map/bpf_prog对象 */
  18. raw = bpf_fd_probe_obj(ufd, &type);
  19. if (IS_ERR(raw)) {
  20. ret = PTR_ERR(raw);
  21. goto out;
  22. }
  23. /* (3) 创建文件节点,和bpf对象联结起来 */
  24. ret = bpf_obj_do_pin(pname, raw, type);
  25. if (ret != 0)
  26. bpf_any_put(raw, type);
  27. out:
  28. putname(pname);
  29. return ret;
  30. }
  31. static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
  32. enum bpf_type type)
  33. {
  34. struct dentry *dentry;
  35. struct inode *dir;
  36. struct path path;
  37. umode_t mode;
  38. int ret;
  39. dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
  40. if (IS_ERR(dentry))
  41. return PTR_ERR(dentry);
  42. mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
  43. ret = security_path_mknod(&path, dentry, mode, 0);
  44. if (ret)
  45. goto out;
  46. dir = d_inode(path.dentry);
  47. if (dir->i_op != &bpf_dir_iops) {
  48. ret = -EPERM;
  49. goto out;
  50. }
  51. switch (type) {
  52. case BPF_TYPE_PROG:
  53. ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
  54. break;
  55. case BPF_TYPE_MAP:
  56. ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
  57. break;
  58. default:
  59. ret = -EPERM;
  60. }
  61. out:
  62. done_path_create(&path, dentry);
  63. return ret;
  64. }

把map对应的句柄fd与文件关联起来,通过文件就可以访问到具体的fd。

 2.4.2 bpf_obj_get

通过文件名称获取本进程中的句柄fd

  1. static int bpf_obj_get(const union bpf_attr *attr)
  2. {
  3.     if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
  4.         attr->file_flags & ~BPF_OBJ_FLAG_MASK)
  5.         return -EINVAL;
  6.     return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
  7.                 attr->file_flags);
  8. }
  9.  
  10. int bpf_obj_get_user(const char __user *pathname, int flags)
  11. {
  12.     enum bpf_type type = BPF_TYPE_UNSPEC;
  13.     struct filename *pname;
  14.     int ret = -ENOENT;
  15.     int f_flags;
  16.     void *raw;
  17.     f_flags = bpf_get_file_flag(flags);
  18.     if (f_flags < 0)
  19.         return f_flags;
  20.     /* (1) 根据字符串获取路径 */
  21.     pname = getname(pathname);
  22.     if (IS_ERR(pname))
  23.         return PTR_ERR(pname);
  24.     /* (2) 根据路径,在对应inode中找到bpf对象的map指针和type */
  25.     raw = bpf_obj_do_get(pname, &type, f_flags);
  26.     if (IS_ERR(raw)) {
  27.         ret = PTR_ERR(raw);
  28.         goto out;
  29.     }
  30.  
  31.     /* (3) 根据对象type,在本进程中给bpf对象分配一个fd */
  32.     if (type == BPF_TYPE_PROG)
  33.         ret = bpf_prog_new_fd(raw);
  34.     else if (type == BPF_TYPE_MAP)
  35.         ret = bpf_map_new_fd(raw, f_flags);
  36.     else
  37.         goto out;
  38.     if (ret < 0)
  39.         bpf_any_put(raw, type);
  40. out:
  41.     putname(pname);
  42.     return ret;
  43. }

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/weixin_40725706/article/detail/786468
推荐阅读
相关标签
  

闽ICP备14008679号