| /* | 
 |  * eBPF RSS program | 
 |  * | 
 |  * Developed by Daynix Computing LTD (http://www.daynix.com) | 
 |  * | 
 |  * Authors: | 
 |  *  Andrew Melnychenko <andrew@daynix.com> | 
 |  *  Yuri Benditovich <yuri.benditovich@daynix.com> | 
 |  * | 
 |  * This work is licensed under the terms of the GNU GPL, version 2.  See | 
 |  * the COPYING file in the top-level directory. | 
 |  * | 
 |  * Prepare: | 
 |  * Requires llvm, clang, bpftool, linux kernel tree | 
 |  * | 
 |  * Build rss.bpf.skeleton.h: | 
 |  * make -f Makefile.ebpf clean all | 
 |  */ | 
 |  | 
 | #include <stddef.h> | 
 | #include <stdbool.h> | 
 | #include <linux/bpf.h> | 
 |  | 
 | #include <linux/in.h> | 
 | #include <linux/if_ether.h> | 
 | #include <linux/ip.h> | 
 | #include <linux/ipv6.h> | 
 |  | 
 | #include <linux/udp.h> | 
 | #include <linux/tcp.h> | 
 |  | 
 | #include <bpf/bpf_helpers.h> | 
 | #include <bpf/bpf_endian.h> | 
 | #include <linux/virtio_net.h> | 
 |  | 
 | #define INDIRECTION_TABLE_SIZE 128 | 
 | #define HASH_CALCULATION_BUFFER_SIZE 36 | 
 |  | 
 | struct rss_config_t { | 
 |     __u8 redirect; | 
 |     __u8 populate_hash; | 
 |     __u32 hash_types; | 
 |     __u16 indirections_len; | 
 |     __u16 default_queue; | 
 | } __attribute__((packed)); | 
 |  | 
 | struct toeplitz_key_data_t { | 
 |     __u32 leftmost_32_bits; | 
 |     __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE]; | 
 | }; | 
 |  | 
 | struct packet_hash_info_t { | 
 |     __u8 is_ipv4; | 
 |     __u8 is_ipv6; | 
 |     __u8 is_udp; | 
 |     __u8 is_tcp; | 
 |     __u8 is_ipv6_ext_src; | 
 |     __u8 is_ipv6_ext_dst; | 
 |     __u8 is_fragmented; | 
 |  | 
 |     __u16 src_port; | 
 |     __u16 dst_port; | 
 |  | 
 |     union { | 
 |         struct { | 
 |             __be32 in_src; | 
 |             __be32 in_dst; | 
 |         }; | 
 |  | 
 |         struct { | 
 |             struct in6_addr in6_src; | 
 |             struct in6_addr in6_dst; | 
 |             struct in6_addr in6_ext_src; | 
 |             struct in6_addr in6_ext_dst; | 
 |         }; | 
 |     }; | 
 | }; | 
 |  | 
 | struct { | 
 |     __uint(type, BPF_MAP_TYPE_ARRAY); | 
 |     __uint(key_size, sizeof(__u32)); | 
 |     __uint(value_size, sizeof(struct rss_config_t)); | 
 |     __uint(max_entries, 1); | 
 |     __uint(map_flags, BPF_F_MMAPABLE); | 
 | } tap_rss_map_configurations SEC(".maps"); | 
 |  | 
 | struct { | 
 |     __uint(type, BPF_MAP_TYPE_ARRAY); | 
 |     __uint(key_size, sizeof(__u32)); | 
 |     __uint(value_size, sizeof(struct toeplitz_key_data_t)); | 
 |     __uint(max_entries, 1); | 
 |     __uint(map_flags, BPF_F_MMAPABLE); | 
 | } tap_rss_map_toeplitz_key SEC(".maps"); | 
 |  | 
 | struct { | 
 |     __uint(type, BPF_MAP_TYPE_ARRAY); | 
 |     __uint(key_size, sizeof(__u32)); | 
 |     __uint(value_size, sizeof(__u16)); | 
 |     __uint(max_entries, INDIRECTION_TABLE_SIZE); | 
 |     __uint(map_flags, BPF_F_MMAPABLE); | 
 | } tap_rss_map_indirection_table SEC(".maps"); | 
 |  | 
 | static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written, | 
 |                                         const void *ptr, size_t size) { | 
 |     __builtin_memcpy(&rss_input[*bytes_written], ptr, size); | 
 |     *bytes_written += size; | 
 | } | 
 |  | 
 | static inline | 
 | void net_toeplitz_add(__u32 *result, | 
 |                       __u8 *input, | 
 |                       __u32 len | 
 |         , struct toeplitz_key_data_t *key) { | 
 |  | 
 |     __u32 accumulator = *result; | 
 |     __u32 leftmost_32_bits = key->leftmost_32_bits; | 
 |     __u32 byte; | 
 |  | 
 |     for (byte = 0; byte < HASH_CALCULATION_BUFFER_SIZE; byte++) { | 
 |         __u8 input_byte = input[byte]; | 
 |         __u8 key_byte = key->next_byte[byte]; | 
 |         __u8 bit; | 
 |  | 
 |         for (bit = 0; bit < 8; bit++) { | 
 |             if (input_byte & (1 << 7)) { | 
 |                 accumulator ^= leftmost_32_bits; | 
 |             } | 
 |  | 
 |             leftmost_32_bits = | 
 |                     (leftmost_32_bits << 1) | ((key_byte & (1 << 7)) >> 7); | 
 |  | 
 |             input_byte <<= 1; | 
 |             key_byte <<= 1; | 
 |         } | 
 |     } | 
 |  | 
 |     *result = accumulator; | 
 | } | 
 |  | 
 |  | 
 | static inline int ip6_extension_header_type(__u8 hdr_type) | 
 | { | 
 |     switch (hdr_type) { | 
 |     case IPPROTO_HOPOPTS: | 
 |     case IPPROTO_ROUTING: | 
 |     case IPPROTO_FRAGMENT: | 
 |     case IPPROTO_ICMPV6: | 
 |     case IPPROTO_NONE: | 
 |     case IPPROTO_DSTOPTS: | 
 |     case IPPROTO_MH: | 
 |         return 1; | 
 |     default: | 
 |         return 0; | 
 |     } | 
 | } | 
 | /* | 
 |  * According to | 
 |  * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml | 
 |  * we expect that there are would be no more than 11 extensions in IPv6 header, | 
 |  * also there is 27 TLV options for Destination and Hop-by-hop extensions. | 
 |  * Need to choose reasonable amount of maximum extensions/options we may | 
 |  * check to find ext src/dst. | 
 |  */ | 
 | #define IP6_EXTENSIONS_COUNT 11 | 
 | #define IP6_OPTIONS_COUNT 30 | 
 |  | 
 | static inline int parse_ipv6_ext(struct __sk_buff *skb, | 
 |         struct packet_hash_info_t *info, | 
 |         __u8 *l4_protocol, size_t *l4_offset) | 
 | { | 
 |     int err = 0; | 
 |  | 
 |     if (!ip6_extension_header_type(*l4_protocol)) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     struct ipv6_opt_hdr ext_hdr = {}; | 
 |  | 
 |     for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) { | 
 |  | 
 |         err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr, | 
 |                                     sizeof(ext_hdr), BPF_HDR_START_NET); | 
 |         if (err) { | 
 |             goto error; | 
 |         } | 
 |  | 
 |         if (*l4_protocol == IPPROTO_ROUTING) { | 
 |             struct ipv6_rt_hdr ext_rt = {}; | 
 |  | 
 |             err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt, | 
 |                                         sizeof(ext_rt), BPF_HDR_START_NET); | 
 |             if (err) { | 
 |                 goto error; | 
 |             } | 
 |  | 
 |             if ((ext_rt.type == IPV6_SRCRT_TYPE_2) && | 
 |                     (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) && | 
 |                     (ext_rt.segments_left == 1)) { | 
 |  | 
 |                 err = bpf_skb_load_bytes_relative(skb, | 
 |                     *l4_offset + offsetof(struct rt2_hdr, addr), | 
 |                     &info->in6_ext_dst, sizeof(info->in6_ext_dst), | 
 |                     BPF_HDR_START_NET); | 
 |                 if (err) { | 
 |                     goto error; | 
 |                 } | 
 |  | 
 |                 info->is_ipv6_ext_dst = 1; | 
 |             } | 
 |  | 
 |         } else if (*l4_protocol == IPPROTO_DSTOPTS) { | 
 |             struct ipv6_opt_t { | 
 |                 __u8 type; | 
 |                 __u8 length; | 
 |             } __attribute__((packed)) opt = {}; | 
 |  | 
 |             size_t opt_offset = sizeof(ext_hdr); | 
 |  | 
 |             for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) { | 
 |                 err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset, | 
 |                                         &opt, sizeof(opt), BPF_HDR_START_NET); | 
 |                 if (err) { | 
 |                     goto error; | 
 |                 } | 
 |  | 
 |                 if (opt.type == IPV6_TLV_HAO) { | 
 |                     err = bpf_skb_load_bytes_relative(skb, | 
 |                         *l4_offset + opt_offset | 
 |                         + offsetof(struct ipv6_destopt_hao, addr), | 
 |                         &info->in6_ext_src, sizeof(info->in6_ext_src), | 
 |                         BPF_HDR_START_NET); | 
 |                     if (err) { | 
 |                         goto error; | 
 |                     } | 
 |  | 
 |                     info->is_ipv6_ext_src = 1; | 
 |                     break; | 
 |                 } | 
 |  | 
 |                 opt_offset += (opt.type == IPV6_TLV_PAD1) ? | 
 |                               1 : opt.length + sizeof(opt); | 
 |  | 
 |                 if (opt_offset + 1 >= ext_hdr.hdrlen * 8) { | 
 |                     break; | 
 |                 } | 
 |             } | 
 |         } else if (*l4_protocol == IPPROTO_FRAGMENT) { | 
 |             info->is_fragmented = true; | 
 |         } | 
 |  | 
 |         *l4_protocol = ext_hdr.nexthdr; | 
 |         *l4_offset += (ext_hdr.hdrlen + 1) * 8; | 
 |  | 
 |         if (!ip6_extension_header_type(ext_hdr.nexthdr)) { | 
 |             return 0; | 
 |         } | 
 |     } | 
 |  | 
 |     return 0; | 
 | error: | 
 |     return err; | 
 | } | 
 |  | 
 | static __be16 parse_eth_type(struct __sk_buff *skb) | 
 | { | 
 |     unsigned int offset = 12; | 
 |     __be16 ret = 0; | 
 |     int err = 0; | 
 |  | 
 |     err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret), | 
 |                                 BPF_HDR_START_MAC); | 
 |     if (err) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     switch (bpf_ntohs(ret)) { | 
 |     case ETH_P_8021AD: | 
 |         offset += 4; | 
 |     case ETH_P_8021Q: | 
 |         offset += 4; | 
 |         err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret), | 
 |                                     BPF_HDR_START_MAC); | 
 |     default: | 
 |         break; | 
 |     } | 
 |  | 
 |     if (err) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     return ret; | 
 | } | 
 |  | 
 | static inline int parse_packet(struct __sk_buff *skb, | 
 |         struct packet_hash_info_t *info) | 
 | { | 
 |     int err = 0; | 
 |  | 
 |     if (!info || !skb) { | 
 |         return -1; | 
 |     } | 
 |  | 
 |     size_t l4_offset = 0; | 
 |     __u8 l4_protocol = 0; | 
 |     __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb)); | 
 |     if (l3_protocol == 0) { | 
 |         err = -1; | 
 |         goto error; | 
 |     } | 
 |  | 
 |     if (l3_protocol == ETH_P_IP) { | 
 |         info->is_ipv4 = 1; | 
 |  | 
 |         struct iphdr ip = {}; | 
 |         err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip), | 
 |                                     BPF_HDR_START_NET); | 
 |         if (err) { | 
 |             goto error; | 
 |         } | 
 |  | 
 |         info->in_src = ip.saddr; | 
 |         info->in_dst = ip.daddr; | 
 |         info->is_fragmented = !!(bpf_ntohs(ip.frag_off) & (0x2000 | 0x1fff)); | 
 |  | 
 |         l4_protocol = ip.protocol; | 
 |         l4_offset = ip.ihl * 4; | 
 |     } else if (l3_protocol == ETH_P_IPV6) { | 
 |         info->is_ipv6 = 1; | 
 |  | 
 |         struct ipv6hdr ip6 = {}; | 
 |         err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6), | 
 |                                     BPF_HDR_START_NET); | 
 |         if (err) { | 
 |             goto error; | 
 |         } | 
 |  | 
 |         info->in6_src = ip6.saddr; | 
 |         info->in6_dst = ip6.daddr; | 
 |  | 
 |         l4_protocol = ip6.nexthdr; | 
 |         l4_offset = sizeof(ip6); | 
 |  | 
 |         err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset); | 
 |         if (err) { | 
 |             goto error; | 
 |         } | 
 |     } | 
 |  | 
 |     if (l4_protocol != 0 && !info->is_fragmented) { | 
 |         if (l4_protocol == IPPROTO_TCP) { | 
 |             info->is_tcp = 1; | 
 |  | 
 |             struct tcphdr tcp = {}; | 
 |             err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp), | 
 |                                         BPF_HDR_START_NET); | 
 |             if (err) { | 
 |                 goto error; | 
 |             } | 
 |  | 
 |             info->src_port = tcp.source; | 
 |             info->dst_port = tcp.dest; | 
 |         } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */ | 
 |             info->is_udp = 1; | 
 |  | 
 |             struct udphdr udp = {}; | 
 |             err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp), | 
 |                                         BPF_HDR_START_NET); | 
 |             if (err) { | 
 |                 goto error; | 
 |             } | 
 |  | 
 |             info->src_port = udp.source; | 
 |             info->dst_port = udp.dest; | 
 |         } | 
 |     } | 
 |  | 
 |     return 0; | 
 |  | 
 | error: | 
 |     return err; | 
 | } | 
 |  | 
 | static inline bool calculate_rss_hash(struct __sk_buff *skb, | 
 |                                       struct rss_config_t *config, | 
 |                                       struct toeplitz_key_data_t *toe, | 
 |                                       __u32 *result) | 
 | { | 
 |     __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {}; | 
 |     size_t bytes_written = 0; | 
 |     int err = 0; | 
 |     struct packet_hash_info_t packet_info = {}; | 
 |  | 
 |     err = parse_packet(skb, &packet_info); | 
 |     if (err) { | 
 |         return false; | 
 |     } | 
 |  | 
 |     if (packet_info.is_ipv4) { | 
 |         if (packet_info.is_tcp && | 
 |             config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) { | 
 |  | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.in_src, | 
 |                                  sizeof(packet_info.in_src)); | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.in_dst, | 
 |                                  sizeof(packet_info.in_dst)); | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.src_port, | 
 |                                  sizeof(packet_info.src_port)); | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.dst_port, | 
 |                                  sizeof(packet_info.dst_port)); | 
 |         } else if (packet_info.is_udp && | 
 |                    config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) { | 
 |  | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.in_src, | 
 |                                  sizeof(packet_info.in_src)); | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.in_dst, | 
 |                                  sizeof(packet_info.in_dst)); | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.src_port, | 
 |                                  sizeof(packet_info.src_port)); | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.dst_port, | 
 |                                  sizeof(packet_info.dst_port)); | 
 |         } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) { | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.in_src, | 
 |                                  sizeof(packet_info.in_src)); | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.in_dst, | 
 |                                  sizeof(packet_info.in_dst)); | 
 |         } | 
 |     } else if (packet_info.is_ipv6) { | 
 |         if (packet_info.is_tcp && | 
 |             config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) { | 
 |  | 
 |             if (packet_info.is_ipv6_ext_src && | 
 |                 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) { | 
 |  | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_ext_src, | 
 |                                      sizeof(packet_info.in6_ext_src)); | 
 |             } else { | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_src, | 
 |                                      sizeof(packet_info.in6_src)); | 
 |             } | 
 |             if (packet_info.is_ipv6_ext_dst && | 
 |                 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) { | 
 |  | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_ext_dst, | 
 |                                      sizeof(packet_info.in6_ext_dst)); | 
 |             } else { | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_dst, | 
 |                                      sizeof(packet_info.in6_dst)); | 
 |             } | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.src_port, | 
 |                                  sizeof(packet_info.src_port)); | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.dst_port, | 
 |                                  sizeof(packet_info.dst_port)); | 
 |         } else if (packet_info.is_udp && | 
 |                    config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) { | 
 |  | 
 |             if (packet_info.is_ipv6_ext_src && | 
 |                config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) { | 
 |  | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_ext_src, | 
 |                                      sizeof(packet_info.in6_ext_src)); | 
 |             } else { | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_src, | 
 |                                      sizeof(packet_info.in6_src)); | 
 |             } | 
 |             if (packet_info.is_ipv6_ext_dst && | 
 |                config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) { | 
 |  | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_ext_dst, | 
 |                                      sizeof(packet_info.in6_ext_dst)); | 
 |             } else { | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_dst, | 
 |                                      sizeof(packet_info.in6_dst)); | 
 |             } | 
 |  | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.src_port, | 
 |                                  sizeof(packet_info.src_port)); | 
 |             net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                  &packet_info.dst_port, | 
 |                                  sizeof(packet_info.dst_port)); | 
 |  | 
 |         } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) { | 
 |             if (packet_info.is_ipv6_ext_src && | 
 |                config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) { | 
 |  | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_ext_src, | 
 |                                      sizeof(packet_info.in6_ext_src)); | 
 |             } else { | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_src, | 
 |                                      sizeof(packet_info.in6_src)); | 
 |             } | 
 |             if (packet_info.is_ipv6_ext_dst && | 
 |                 config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) { | 
 |  | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_ext_dst, | 
 |                                      sizeof(packet_info.in6_ext_dst)); | 
 |             } else { | 
 |                 net_rx_rss_add_chunk(rss_input, &bytes_written, | 
 |                                      &packet_info.in6_dst, | 
 |                                      sizeof(packet_info.in6_dst)); | 
 |             } | 
 |         } | 
 |     } | 
 |  | 
 |     if (!bytes_written) { | 
 |         return false; | 
 |     } | 
 |  | 
 |     net_toeplitz_add(result, rss_input, bytes_written, toe); | 
 |  | 
 |     return true; | 
 | } | 
 |  | 
 | SEC("socket") | 
 | int tun_rss_steering_prog(struct __sk_buff *skb) | 
 | { | 
 |  | 
 |     struct rss_config_t *config; | 
 |     struct toeplitz_key_data_t *toe; | 
 |  | 
 |     __u32 key = 0; | 
 |     __u32 hash = 0; | 
 |  | 
 |     config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key); | 
 |     toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key); | 
 |  | 
 |     if (!config || !toe) { | 
 |         return 0; | 
 |     } | 
 |  | 
 |     if (config->redirect && calculate_rss_hash(skb, config, toe, &hash)) { | 
 |         __u32 table_idx = hash % config->indirections_len; | 
 |         __u16 *queue = 0; | 
 |  | 
 |         queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table, | 
 |                                     &table_idx); | 
 |  | 
 |         if (queue) { | 
 |             return *queue; | 
 |         } | 
 |     } | 
 |  | 
 |     return config->default_queue; | 
 | } | 
 |  | 
 | char _license[] SEC("license") = "GPL v2"; |