Loading...
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2018 Facebook */ #include <linux/bpf.h> #include <linux/err.h> #include <linux/sock_diag.h> #include <net/sock_reuseport.h> #include <linux/btf_ids.h> struct reuseport_array { struct bpf_map map; struct sock __rcu *ptrs[]; }; static struct reuseport_array *reuseport_array(struct bpf_map *map) { return (struct reuseport_array *)map; } /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { struct sock __rcu **socks; write_lock_bh(&sk->sk_callback_lock); socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); if (socks) { WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of * sk->sk_callback_lock because there is * a race with reuseport_array_free() * which does not hold the reuseport_lock. */ RCU_INIT_POINTER(*socks, NULL); } write_unlock_bh(&sk->sk_callback_lock); } static int reuseport_array_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) return -EINVAL; return array_map_alloc_check(attr); } static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return rcu_dereference(array->ptrs[index]); } /* Called from syscall only */ static int reuseport_array_delete_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; struct sock *sk; int err; if (index >= map->max_entries) return -E2BIG; if (!rcu_access_pointer(array->ptrs[index])) return -ENOENT; spin_lock_bh(&reuseport_lock); sk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); if (sk) { write_lock_bh(&sk->sk_callback_lock); WRITE_ONCE(sk->sk_user_data, NULL); RCU_INIT_POINTER(array->ptrs[index], NULL); write_unlock_bh(&sk->sk_callback_lock); err = 0; } else { err = -ENOENT; } spin_unlock_bh(&reuseport_lock); return err; } static void reuseport_array_free(struct bpf_map *map) { struct reuseport_array *array = reuseport_array(map); struct sock *sk; u32 i; /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with * bpf_sk_reuseport_detach() which was triggered by * close() or disconnect(). * * This function and bpf_sk_reuseport_detach() are * both removing sk from "array". Who removes it * first does not matter. * * The only concern here is bpf_sk_reuseport_detach() * may access "array" which is being freed here. * bpf_sk_reuseport_detach() access this "array" * through sk->sk_user_data _and_ with sk->sk_callback_lock * held which is enough because this "array" is not freed * until all sk->sk_user_data has stopped referencing this "array". * * Hence, due to the above, taking "reuseport_lock" is not * needed here. */ /* * Since reuseport_lock is not taken, sk is accessed under * rcu_read_lock() */ rcu_read_lock(); for (i = 0; i < map->max_entries; i++) { sk = rcu_dereference(array->ptrs[i]); if (sk) { write_lock_bh(&sk->sk_callback_lock); /* * No need for WRITE_ONCE(). At this point, * no one is reading it without taking the * sk->sk_callback_lock. */ sk->sk_user_data = NULL; write_unlock_bh(&sk->sk_callback_lock); RCU_INIT_POINTER(array->ptrs[i], NULL); } } rcu_read_unlock(); /* * Once reaching here, all sk->sk_user_data is not * referencing this "array". "array" can be freed now. */ bpf_map_area_free(array); } static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; if (!bpf_capable()) return ERR_PTR(-EPERM); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); return &array->map; } int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { struct sock *sk; int err; if (map->value_size != sizeof(u64)) return -ENOSPC; rcu_read_lock(); sk = reuseport_array_lookup_elem(map, key); if (sk) { *(u64 *)value = __sock_gen_cookie(sk); err = 0; } else { err = -ENOENT; } rcu_read_unlock(); return err; } static int reuseport_array_update_check(const struct reuseport_array *array, const struct sock *nsk, const struct sock *osk, const struct sock_reuseport *nsk_reuse, u32 map_flags) { if (osk && map_flags == BPF_NOEXIST) return -EEXIST; if (!osk && map_flags == BPF_EXIST) return -ENOENT; if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) return -ENOTSUPP; if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) return -ENOTSUPP; if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) return -ENOTSUPP; /* * sk must be hashed (i.e. listening in the TCP case or binded * in the UDP case) and * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). * * Also, sk will be used in bpf helper that is protected by * rcu_read_lock(). */ if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) return -EINVAL; /* READ_ONCE because the sk->sk_callback_lock may not be held here */ if (READ_ONCE(nsk->sk_user_data)) return -EBUSY; return 0; } /* * Called from syscall only. * The "nsk" in the fd refcnt. * The "osk" and "reuse" are protected by reuseport_lock. */ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct reuseport_array *array = reuseport_array(map); struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; uintptr_t sk_user_data; struct socket *socket; int err, fd; if (map_flags > BPF_EXIST) return -EINVAL; if (index >= map->max_entries) return -E2BIG; if (map->value_size == sizeof(u64)) { u64 fd64 = *(u64 *)value; if (fd64 > S32_MAX) return -EINVAL; fd = fd64; } else { fd = *(int *)value; } socket = sockfd_lookup(fd, &err); if (!socket) return err; nsk = socket->sk; if (!nsk) { err = -EINVAL; goto put_file; } /* Quick checks before taking reuseport_lock */ err = reuseport_array_update_check(array, nsk, rcu_access_pointer(array->ptrs[index]), rcu_access_pointer(nsk->sk_reuseport_cb), map_flags); if (err) goto put_file; spin_lock_bh(&reuseport_lock); /* * Some of the checks only need reuseport_lock * but it is done under sk_callback_lock also * for simplicity reason. */ write_lock_bh(&nsk->sk_callback_lock); osk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); if (err) goto put_file_unlock; sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF; WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; put_file_unlock: write_unlock_bh(&nsk->sk_callback_lock); if (free_osk) { write_lock_bh(&free_osk->sk_callback_lock); WRITE_ONCE(free_osk->sk_user_data, NULL); write_unlock_bh(&free_osk->sk_callback_lock); } spin_unlock_bh(&reuseport_lock); put_file: fput(socket->file); return err; } /* Called from syscall */ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct reuseport_array *array = reuseport_array(map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { *next = 0; return 0; } if (index == array->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array) const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, .map_free = reuseport_array_free, .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, .map_btf_id = &reuseport_array_map_btf_ids[0], }; |