| /* |
| * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>. |
| * |
| * This program is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU General Public License as |
| * published by the Free Software Foundation; either version 2 of the |
| * License, or any later version. |
| * |
| * This program is distributed in the hope that it will be useful, but |
| * WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
| * 02110-1301, USA. |
| * |
| * You can also choose to distribute this program under the terms of |
| * the Unmodified Binary Distribution Licence (as given in the file |
| * COPYING.UBDL), provided that you have satisfied its requirements. |
| */ |
| |
| FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); |
| |
| #include <stdint.h> |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <unistd.h> |
| #include <string.h> |
| #include <byteswap.h> |
| #include <errno.h> |
| #include <ipxe/errortab.h> |
| #include <ipxe/malloc.h> |
| #include <ipxe/if_arp.h> |
| #include <ipxe/arp.h> |
| #include <ipxe/if_ether.h> |
| #include <ipxe/ethernet.h> |
| #include <ipxe/ip.h> |
| #include <ipxe/iobuf.h> |
| #include <ipxe/netdevice.h> |
| #include <ipxe/infiniband.h> |
| #include <ipxe/ib_pathrec.h> |
| #include <ipxe/ib_mcast.h> |
| #include <ipxe/retry.h> |
| #include <ipxe/ipoib.h> |
| |
| /** @file |
| * |
| * IP over Infiniband |
| */ |
| |
/*
 * Private ENXIO variants.  Each is made distinct via
 * __einfo_uniqify() so that the individual "missing REMAC" failure
 * causes carry their own description.
 */
#define EINFO_ENXIO_ARP_REPLY						\
	__einfo_uniqify ( EINFO_ENXIO, 0x01,				\
			  "Missing REMAC for ARP reply target address" )
#define ENXIO_ARP_REPLY							\
	__einfo_error ( EINFO_ENXIO_ARP_REPLY )

#define EINFO_ENXIO_NON_IPV4						\
	__einfo_uniqify ( EINFO_ENXIO, 0x02,				\
			  "Missing REMAC for non-IPv4 packet" )
#define ENXIO_NON_IPV4							\
	__einfo_error ( EINFO_ENXIO_NON_IPV4 )

#define EINFO_ENXIO_ARP_SENT						\
	__einfo_uniqify ( EINFO_ENXIO, 0x03,				\
			  "Missing REMAC for IPv4 packet (ARP sent)" )
#define ENXIO_ARP_SENT							\
	__einfo_error ( EINFO_ENXIO_ARP_SENT )
| |
/** Send work queue entries requested when creating the IPoIB queue pair */
#define IPOIB_NUM_SEND_WQES 8

/** Receive work queue entries requested when creating the IPoIB queue pair */
#define IPOIB_NUM_RECV_WQES 4

/** Entries requested when creating the IPoIB completion queue */
#define IPOIB_NUM_CQES 16
| |
| /** An IPoIB broadcast address */ |
| struct ipoib_broadcast { |
| /** MAC address */ |
| struct ipoib_mac mac; |
| /** Address vector */ |
| struct ib_address_vector av; |
| /** Multicast group membership */ |
| struct ib_mc_membership membership; |
| }; |
| |
| /** An IPoIB device */ |
| struct ipoib_device { |
| /** Network device */ |
| struct net_device *netdev; |
| /** Underlying Infiniband device */ |
| struct ib_device *ibdev; |
| /** List of IPoIB devices */ |
| struct list_head list; |
| /** Completion queue */ |
| struct ib_completion_queue *cq; |
| /** Queue pair */ |
| struct ib_queue_pair *qp; |
| /** Local MAC */ |
| struct ipoib_mac mac; |
| /** Broadcast address */ |
| struct ipoib_broadcast broadcast; |
| /** REMAC cache */ |
| struct list_head peers; |
| }; |
| |
| /** Broadcast IPoIB address */ |
| static struct ipoib_mac ipoib_broadcast = { |
| .flags__qpn = htonl ( IB_QPN_BROADCAST ), |
| .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, |
| 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }, |
| }; |
| |
/** Link status reported while the broadcast group join is in progress */
#define EINFO_EINPROGRESS_JOINING					\
	__einfo_uniqify ( EINFO_EINPROGRESS, 0x01, "Joining" )
#define EINPROGRESS_JOINING						\
	__einfo_error ( EINFO_EINPROGRESS_JOINING )
| |
| /** Human-readable message for the link status */ |
| struct errortab ipoib_errors[] __errortab = { |
| __einfo_errortab ( EINFO_EINPROGRESS_JOINING ), |
| }; |
| |
| /** List of all IPoIB devices */ |
| static LIST_HEAD ( ipoib_devices ); |
| |
| static struct net_device_operations ipoib_operations; |
| |
| /**************************************************************************** |
| * |
| * IPoIB REMAC cache |
| * |
| **************************************************************************** |
| */ |
| |
| /** An IPoIB REMAC cache entry */ |
| struct ipoib_peer { |
| /** List of REMAC cache entries */ |
| struct list_head list; |
| /** Remote Ethermet MAC */ |
| struct ipoib_remac remac; |
| /** MAC address */ |
| struct ipoib_mac mac; |
| }; |
| |
| /** |
| * Find IPoIB MAC from REMAC |
| * |
| * @v ipoib IPoIB device |
| * @v remac Remote Ethernet MAC |
| * @ret mac IPoIB MAC (or NULL if not found) |
| */ |
| static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib, |
| const struct ipoib_remac *remac ) { |
| struct ipoib_peer *peer; |
| |
| /* Check for broadcast or multicast REMAC. We transmit |
| * multicasts as broadcasts for simplicity. |
| */ |
| if ( is_multicast_ether_addr ( remac ) ) |
| return &ipoib->broadcast.mac; |
| |
| /* Try to find via REMAC cache */ |
| list_for_each_entry ( peer, &ipoib->peers, list ) { |
| if ( memcmp ( remac, &peer->remac, |
| sizeof ( peer->remac ) ) == 0 ) { |
| /* Move peer to start of list */ |
| list_del ( &peer->list ); |
| list_add ( &peer->list, &ipoib->peers ); |
| return &peer->mac; |
| } |
| } |
| |
| DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n", |
| ipoib, eth_ntoa ( remac ) ); |
| return NULL; |
| } |
| |
| /** |
| * Add IPoIB MAC to REMAC cache |
| * |
| * @v ipoib IPoIB device |
| * @v remac Remote Ethernet MAC |
| * @v mac IPoIB MAC |
| * @ret rc Return status code |
| */ |
| static int ipoib_map_remac ( struct ipoib_device *ipoib, |
| const struct ipoib_remac *remac, |
| const struct ipoib_mac *mac ) { |
| struct ipoib_peer *peer; |
| |
| /* Check for existing entry in REMAC cache */ |
| list_for_each_entry ( peer, &ipoib->peers, list ) { |
| if ( memcmp ( remac, &peer->remac, |
| sizeof ( peer->remac ) ) == 0 ) { |
| /* Move peer to start of list */ |
| list_del ( &peer->list ); |
| list_add ( &peer->list, &ipoib->peers ); |
| /* Update MAC */ |
| memcpy ( &peer->mac, mac, sizeof ( peer->mac ) ); |
| return 0; |
| } |
| } |
| |
| /* Create new entry */ |
| peer = malloc ( sizeof ( *peer ) ); |
| if ( ! peer ) |
| return -ENOMEM; |
| memcpy ( &peer->remac, remac, sizeof ( peer->remac ) ); |
| memcpy ( &peer->mac, mac, sizeof ( peer->mac ) ); |
| list_add ( &peer->list, &ipoib->peers ); |
| |
| return 0; |
| } |
| |
| /** |
| * Flush REMAC cache |
| * |
| * @v ipoib IPoIB device |
| */ |
| static void ipoib_flush_remac ( struct ipoib_device *ipoib ) { |
| struct ipoib_peer *peer; |
| struct ipoib_peer *tmp; |
| |
| list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) { |
| list_del ( &peer->list ); |
| free ( peer ); |
| } |
| } |
| |
| /** |
| * Discard some entries from the REMAC cache |
| * |
| * @ret discarded Number of cached items discarded |
| */ |
| static unsigned int ipoib_discard_remac ( void ) { |
| struct net_device *netdev; |
| struct ipoib_device *ipoib; |
| struct ipoib_peer *peer; |
| unsigned int discarded = 0; |
| |
| /* Try to discard one cache entry for each IPoIB device */ |
| for_each_netdev ( netdev ) { |
| |
| /* Skip non-IPoIB devices */ |
| if ( netdev->op != &ipoib_operations ) |
| continue; |
| ipoib = netdev->priv; |
| |
| /* Discard least recently used cache entry (if any) */ |
| list_for_each_entry_reverse ( peer, &ipoib->peers, list ) { |
| list_del ( &peer->list ); |
| free ( peer ); |
| discarded++; |
| break; |
| } |
| } |
| |
| return discarded; |
| } |
| |
| /** IPoIB cache discarder */ |
| struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = { |
| .discard = ipoib_discard_remac, |
| }; |
| |
| /**************************************************************************** |
| * |
| * IPoIB link layer |
| * |
| **************************************************************************** |
| */ |
| |
| /** |
| * Initialise IPoIB link-layer address |
| * |
| * @v hw_addr Hardware address |
| * @v ll_addr Link-layer address |
| */ |
| static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) { |
| const uint8_t *guid = hw_addr; |
| uint8_t *eth_addr = ll_addr; |
| uint8_t guid_mask = IPOIB_GUID_MASK; |
| unsigned int i; |
| |
| /* Extract bytes from GUID according to mask */ |
| for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) { |
| if ( guid_mask & 0x80 ) |
| *(eth_addr++) = *guid; |
| } |
| } |
| |
| /** IPoIB protocol */ |
| struct ll_protocol ipoib_protocol __ll_protocol = { |
| .name = "IPoIB", |
| .ll_proto = htons ( ARPHRD_ETHER ), |
| .hw_addr_len = sizeof ( union ib_guid ), |
| .ll_addr_len = ETH_ALEN, |
| .ll_header_len = ETH_HLEN, |
| .push = eth_push, |
| .pull = eth_pull, |
| .init_addr = ipoib_init_addr, |
| .ntoa = eth_ntoa, |
| .mc_hash = eth_mc_hash, |
| .eth_addr = eth_eth_addr, |
| .eui64 = eth_eui64, |
| .flags = LL_NAME_ONLY, |
| }; |
| |
| /** |
| * Allocate IPoIB device |
| * |
| * @v priv_size Size of driver private data |
| * @ret netdev Network device, or NULL |
| */ |
| struct net_device * alloc_ipoibdev ( size_t priv_size ) { |
| struct net_device *netdev; |
| |
| netdev = alloc_netdev ( priv_size ); |
| if ( netdev ) { |
| netdev->ll_protocol = &ipoib_protocol; |
| netdev->ll_broadcast = eth_broadcast; |
| netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE; |
| } |
| return netdev; |
| } |
| |
| /**************************************************************************** |
| * |
| * IPoIB translation layer |
| * |
| **************************************************************************** |
| */ |
| |
| /** |
| * Translate transmitted ARP packet |
| * |
| * @v netdev Network device |
| * @v iobuf Packet to be transmitted (with no link-layer headers) |
| * @ret rc Return status code |
| */ |
| static int ipoib_translate_tx_arp ( struct net_device *netdev, |
| struct io_buffer *iobuf ) { |
| struct ipoib_device *ipoib = netdev->priv; |
| struct arphdr *arphdr = iobuf->data; |
| struct ipoib_mac *target_ha = NULL; |
| void *sender_pa; |
| void *target_pa; |
| |
| /* Do nothing unless ARP contains eIPoIB link-layer addresses */ |
| if ( arphdr->ar_hln != ETH_ALEN ) |
| return 0; |
| |
| /* Fail unless we have room to expand packet */ |
| if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) - |
| ETH_ALEN ) ) ) { |
| DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n", |
| ipoib ); |
| return -ENOBUFS; |
| } |
| |
| /* Look up REMAC, if applicable */ |
| if ( arphdr->ar_op == ARPOP_REPLY ) { |
| target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr )); |
| if ( ! target_ha ) { |
| DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n", |
| ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) ); |
| return -ENXIO_ARP_REPLY; |
| } |
| } |
| |
| /* Construct new packet */ |
| iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) ); |
| sender_pa = arp_sender_pa ( arphdr ); |
| target_pa = arp_target_pa ( arphdr ); |
| arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND ); |
| arphdr->ar_hln = sizeof ( ipoib->mac ); |
| memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln ); |
| memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln ); |
| memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) ); |
| memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) ); |
| if ( target_ha ) { |
| memcpy ( arp_target_ha ( arphdr ), target_ha, |
| sizeof ( *target_ha ) ); |
| } |
| |
| return 0; |
| } |
| |
| /** |
| * Translate transmitted packet |
| * |
| * @v netdev Network device |
| * @v iobuf Packet to be transmitted (with no link-layer headers) |
| * @v net_proto Network-layer protocol (in network byte order) |
| * @ret rc Return status code |
| */ |
| static int ipoib_translate_tx ( struct net_device *netdev, |
| struct io_buffer *iobuf, uint16_t net_proto ) { |
| |
| switch ( net_proto ) { |
| case htons ( ETH_P_ARP ) : |
| return ipoib_translate_tx_arp ( netdev, iobuf ); |
| case htons ( ETH_P_IP ) : |
| /* No translation needed */ |
| return 0; |
| default: |
| /* Cannot handle other traffic via eIPoIB */ |
| return -ENOTSUP; |
| } |
| } |
| |
| /** |
| * Translate received ARP packet |
| * |
| * @v netdev Network device |
| * @v iobuf Received packet (with no link-layer headers) |
| * @v remac Constructed Remote Ethernet MAC |
| * @ret rc Return status code |
| */ |
| static int ipoib_translate_rx_arp ( struct net_device *netdev, |
| struct io_buffer *iobuf, |
| struct ipoib_remac *remac ) { |
| struct ipoib_device *ipoib = netdev->priv; |
| struct arphdr *arphdr = iobuf->data; |
| void *sender_pa; |
| void *target_pa; |
| int rc; |
| |
| /* Do nothing unless ARP contains IPoIB link-layer addresses */ |
| if ( arphdr->ar_hln != sizeof ( ipoib->mac ) ) |
| return 0; |
| |
| /* Create REMAC cache entry */ |
| if ( ( rc = ipoib_map_remac ( ipoib, remac, |
| arp_sender_ha ( arphdr ) ) ) != 0 ) { |
| DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n", |
| ipoib, strerror ( rc ) ); |
| return rc; |
| } |
| |
| /* Construct new packet */ |
| sender_pa = arp_sender_pa ( arphdr ); |
| target_pa = arp_target_pa ( arphdr ); |
| arphdr->ar_hrd = htons ( ARPHRD_ETHER ); |
| arphdr->ar_hln = ETH_ALEN; |
| memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln ); |
| memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln ); |
| memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN ); |
| memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN ); |
| if ( arphdr->ar_op == ARPOP_REPLY ) { |
| /* Assume received replies were directed to us */ |
| memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN ); |
| } |
| iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) ); |
| |
| return 0; |
| } |
| |
| /** |
| * Translate received packet |
| * |
| * @v netdev Network device |
| * @v iobuf Received packet (with no link-layer headers) |
| * @v remac Constructed Remote Ethernet MAC |
| * @v net_proto Network-layer protocol (in network byte order) |
| * @ret rc Return status code |
| */ |
| static int ipoib_translate_rx ( struct net_device *netdev, |
| struct io_buffer *iobuf, |
| struct ipoib_remac *remac, |
| uint16_t net_proto ) { |
| |
| switch ( net_proto ) { |
| case htons ( ETH_P_ARP ) : |
| return ipoib_translate_rx_arp ( netdev, iobuf, remac ); |
| case htons ( ETH_P_IP ) : |
| /* No translation needed */ |
| return 0; |
| default: |
| /* Cannot handle other traffic via eIPoIB */ |
| return -ENOTSUP; |
| } |
| } |
| |
| /**************************************************************************** |
| * |
| * IPoIB network device |
| * |
| **************************************************************************** |
| */ |
| |
| /** |
| * Transmit packet via IPoIB network device |
| * |
| * @v netdev Network device |
| * @v iobuf I/O buffer |
| * @ret rc Return status code |
| */ |
| static int ipoib_transmit ( struct net_device *netdev, |
| struct io_buffer *iobuf ) { |
| struct ipoib_device *ipoib = netdev->priv; |
| struct ib_device *ibdev = ipoib->ibdev; |
| struct ethhdr *ethhdr; |
| struct iphdr *iphdr; |
| struct ipoib_hdr *ipoib_hdr; |
| struct ipoib_remac *remac; |
| struct ipoib_mac *mac; |
| struct ib_address_vector *dest; |
| struct ib_address_vector av; |
| uint16_t net_proto; |
| int rc; |
| |
| /* Sanity check */ |
| if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) { |
| DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib ); |
| return -EINVAL; |
| } |
| |
| /* Attempting transmission while link is down will put the |
| * queue pair into an error state, so don't try it. |
| */ |
| if ( ! ib_link_ok ( ibdev ) ) |
| return -ENETUNREACH; |
| |
| /* Strip eIPoIB header */ |
| ethhdr = iobuf->data; |
| remac = ( ( struct ipoib_remac * ) ethhdr->h_dest ); |
| net_proto = ethhdr->h_protocol; |
| iob_pull ( iobuf, sizeof ( *ethhdr ) ); |
| |
| /* Identify destination address */ |
| if ( is_multicast_ether_addr ( remac ) ) { |
| |
| /* Transmit multicasts as broadcasts, for simplicity */ |
| dest = &ipoib->broadcast.av; |
| |
| } else if ( ( mac = ipoib_find_remac ( ipoib, remac ) ) ) { |
| |
| /* Construct address vector from IPoIB MAC */ |
| dest = &av; |
| memset ( dest, 0, sizeof ( *dest ) ); |
| dest->qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK ); |
| dest->qkey = ipoib->broadcast.av.qkey; |
| dest->gid_present = 1; |
| memcpy ( &dest->gid, &mac->gid, sizeof ( dest->gid ) ); |
| if ( ( rc = ib_resolve_path ( ibdev, dest ) ) != 0 ) { |
| /* Path not resolved yet */ |
| return rc; |
| } |
| |
| } else { |
| |
| /* Generate a new ARP request (if possible) to trigger |
| * population of the REMAC cache entry. |
| */ |
| if ( ( net_proto != htons ( ETH_P_IP ) ) || |
| ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) { |
| DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 " |
| "packet type %04x\n", ipoib, |
| eth_ntoa ( ethhdr->h_dest ), |
| ntohs ( net_proto ) ); |
| return -ENXIO_NON_IPV4; |
| } |
| iphdr = iobuf->data; |
| if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol, |
| &iphdr->dest, &iphdr->src ) ) !=0){ |
| DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/", |
| ipoib, eth_ntoa ( ethhdr->h_dest ), |
| inet_ntoa ( iphdr->dest ) ); |
| DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ), |
| strerror ( rc ) ); |
| return rc; |
| } |
| DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib, |
| eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) ); |
| DBGC ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) ); |
| return -ENXIO_ARP_SENT; |
| } |
| |
| /* Translate packet if applicable */ |
| if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 ) |
| return rc; |
| |
| /* Prepend real IPoIB header */ |
| ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) ); |
| ipoib_hdr->proto = net_proto; |
| ipoib_hdr->reserved = 0; |
| |
| /* Transmit packet */ |
| return ib_post_send ( ibdev, ipoib->qp, dest, iobuf ); |
| } |
| |
| /** |
| * Handle IPoIB send completion |
| * |
| * @v ibdev Infiniband device |
| * @v qp Queue pair |
| * @v iobuf I/O buffer |
| * @v rc Completion status code |
| */ |
| static void ipoib_complete_send ( struct ib_device *ibdev __unused, |
| struct ib_queue_pair *qp, |
| struct io_buffer *iobuf, int rc ) { |
| struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp ); |
| |
| netdev_tx_complete_err ( ipoib->netdev, iobuf, rc ); |
| } |
| |
| /** |
| * Handle IPoIB receive completion |
| * |
| * @v ibdev Infiniband device |
| * @v qp Queue pair |
| * @v dest Destination address vector, or NULL |
| * @v source Source address vector, or NULL |
| * @v iobuf I/O buffer |
| * @v rc Completion status code |
| */ |
| static void ipoib_complete_recv ( struct ib_device *ibdev __unused, |
| struct ib_queue_pair *qp, |
| struct ib_address_vector *dest, |
| struct ib_address_vector *source, |
| struct io_buffer *iobuf, int rc ) { |
| struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp ); |
| struct net_device *netdev = ipoib->netdev; |
| struct ipoib_hdr *ipoib_hdr; |
| struct ethhdr *ethhdr; |
| struct ipoib_remac remac; |
| uint16_t net_proto; |
| |
| /* Record errors */ |
| if ( rc != 0 ) { |
| netdev_rx_err ( netdev, iobuf, rc ); |
| return; |
| } |
| |
| /* Sanity check */ |
| if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) { |
| DBGC ( ipoib, "IPoIB %p received packet too short to " |
| "contain IPoIB header\n", ipoib ); |
| DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) ); |
| netdev_rx_err ( netdev, iobuf, -EIO ); |
| return; |
| } |
| if ( ! source ) { |
| DBGC ( ipoib, "IPoIB %p received packet without address " |
| "vector\n", ipoib ); |
| netdev_rx_err ( netdev, iobuf, -ENOTTY ); |
| return; |
| } |
| |
| /* Strip real IPoIB header */ |
| ipoib_hdr = iobuf->data; |
| net_proto = ipoib_hdr->proto; |
| iob_pull ( iobuf, sizeof ( *ipoib_hdr ) ); |
| |
| /* Construct source address from remote QPN and LID */ |
| remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA ); |
| remac.lid = htons ( source->lid ); |
| |
| /* Translate packet if applicable */ |
| if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac, |
| net_proto ) ) != 0 ) { |
| netdev_rx_err ( netdev, iobuf, rc ); |
| return; |
| } |
| |
| /* Prepend eIPoIB header */ |
| ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) ); |
| memcpy ( ðhdr->h_source, &remac, sizeof ( ethhdr->h_source ) ); |
| ethhdr->h_protocol = net_proto; |
| |
| /* Construct destination address */ |
| if ( dest->gid_present && IB_GID_MULTICAST ( &dest->gid ) ) { |
| /* Multicast GID: use the Ethernet broadcast address */ |
| memcpy ( ðhdr->h_dest, eth_broadcast, |
| sizeof ( ethhdr->h_dest ) ); |
| } else { |
| /* Assume destination address is local Ethernet MAC */ |
| memcpy ( ðhdr->h_dest, netdev->ll_addr, |
| sizeof ( ethhdr->h_dest ) ); |
| } |
| |
| /* Hand off to network layer */ |
| netdev_rx ( netdev, iobuf ); |
| } |
| |
| /** IPoIB completion operations */ |
| static struct ib_completion_queue_operations ipoib_cq_op = { |
| .complete_send = ipoib_complete_send, |
| .complete_recv = ipoib_complete_recv, |
| }; |
| |
| /** |
| * Allocate IPoIB receive I/O buffer |
| * |
| * @v len Length of buffer |
| * @ret iobuf I/O buffer, or NULL |
| * |
| * Some Infiniband hardware requires 2kB alignment of receive buffers |
| * and provides no way to disable header separation. The result is |
| * that there are only four bytes of link-layer header (the real IPoIB |
| * header) before the payload. This is not sufficient space to insert |
| * an eIPoIB link-layer pseudo-header. |
| * |
| * We therefore allocate I/O buffers offset to start slightly before |
| * the natural alignment boundary, in order to allow sufficient space. |
| */ |
| static struct io_buffer * ipoib_alloc_iob ( size_t len ) { |
| struct io_buffer *iobuf; |
| size_t reserve_len; |
| |
| /* Calculate additional length required at start of buffer */ |
| reserve_len = ( sizeof ( struct ethhdr ) - |
| sizeof ( struct ipoib_hdr ) ); |
| |
| /* Allocate buffer */ |
| iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len ); |
| if ( iobuf ) { |
| iob_reserve ( iobuf, reserve_len ); |
| } |
| return iobuf; |
| } |
| |
| /** IPoIB queue pair operations */ |
| static struct ib_queue_pair_operations ipoib_qp_op = { |
| .alloc_iob = ipoib_alloc_iob, |
| }; |
| |
| /** |
| * Poll IPoIB network device |
| * |
| * @v netdev Network device |
| */ |
| static void ipoib_poll ( struct net_device *netdev ) { |
| struct ipoib_device *ipoib = netdev->priv; |
| struct ib_device *ibdev = ipoib->ibdev; |
| |
| /* Poll Infiniband device */ |
| ib_poll_eq ( ibdev ); |
| |
| /* Poll the retry timers (required for IPoIB multicast join) */ |
| retry_poll(); |
| } |
| |
| /** |
| * Handle IPv4 broadcast multicast group join completion |
| * |
| * @v membership Multicast group membership |
| * @v rc Status code |
| */ |
| void ipoib_join_complete ( struct ib_mc_membership *membership, int rc ) { |
| struct ipoib_device *ipoib = container_of ( membership, |
| struct ipoib_device, |
| broadcast.membership ); |
| |
| /* Record join status as link status */ |
| netdev_link_err ( ipoib->netdev, rc ); |
| } |
| |
| /** |
| * Join IPv4 broadcast multicast group |
| * |
| * @v ipoib IPoIB device |
| * @ret rc Return status code |
| */ |
| static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) { |
| int rc; |
| |
| /* Join multicast group */ |
| if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp, |
| &ipoib->broadcast.membership, |
| &ipoib->broadcast.av, 0, |
| ipoib_join_complete ) ) != 0 ) { |
| DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n", |
| ipoib, strerror ( rc ) ); |
| return rc; |
| } |
| |
| return 0; |
| } |
| |
| /** |
| * Leave IPv4 broadcast multicast group |
| * |
| * @v ipoib IPoIB device |
| */ |
| static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) { |
| |
| /* Leave multicast group */ |
| ib_mcast_leave ( ipoib->ibdev, ipoib->qp, |
| &ipoib->broadcast.membership ); |
| } |
| |
| /** |
| * Handle link status change |
| * |
| * @v ipoib IPoIB device |
| */ |
| static void ipoib_link_state_changed ( struct ipoib_device *ipoib ) { |
| struct ib_device *ibdev = ipoib->ibdev; |
| struct net_device *netdev = ipoib->netdev; |
| int rc; |
| |
| /* Leave existing broadcast group */ |
| if ( ipoib->qp ) |
| ipoib_leave_broadcast_group ( ipoib ); |
| |
| /* Update MAC address based on potentially-new GID prefix */ |
| memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix, |
| sizeof ( ipoib->mac.gid.s.prefix ) ); |
| |
| /* Update broadcast MAC GID based on potentially-new partition key */ |
| ipoib->broadcast.mac.gid.words[2] = |
| htons ( ibdev->pkey | IB_PKEY_FULL ); |
| |
| /* Construct broadcast address vector from broadcast MAC address */ |
| memset ( &ipoib->broadcast.av, 0, sizeof ( ipoib->broadcast.av ) ); |
| ipoib->broadcast.av.qpn = IB_QPN_BROADCAST; |
| ipoib->broadcast.av.gid_present = 1; |
| memcpy ( &ipoib->broadcast.av.gid, &ipoib->broadcast.mac.gid, |
| sizeof ( ipoib->broadcast.av.gid ) ); |
| |
| /* Set net device link state to reflect Infiniband link state */ |
| rc = ib_link_rc ( ibdev ); |
| netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) ); |
| |
| /* Join new broadcast group */ |
| if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp && |
| ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) { |
| DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: " |
| "%s\n", ipoib, strerror ( rc ) ); |
| netdev_link_err ( netdev, rc ); |
| return; |
| } |
| } |
| |
| /** |
| * Open IPoIB network device |
| * |
| * @v netdev Network device |
| * @ret rc Return status code |
| */ |
| static int ipoib_open ( struct net_device *netdev ) { |
| struct ipoib_device *ipoib = netdev->priv; |
| struct ib_device *ibdev = ipoib->ibdev; |
| int rc; |
| |
| /* Open IB device */ |
| if ( ( rc = ib_open ( ibdev ) ) != 0 ) { |
| DBGC ( ipoib, "IPoIB %p could not open device: %s\n", |
| ipoib, strerror ( rc ) ); |
| goto err_ib_open; |
| } |
| |
| /* Allocate completion queue */ |
| if ( ( rc = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op, |
| &ipoib->cq ) ) != 0 ) { |
| DBGC ( ipoib, "IPoIB %p could not create completion queue: " |
| "%s\n", ipoib, strerror ( rc ) ); |
| goto err_create_cq; |
| } |
| |
| /* Allocate queue pair */ |
| if ( ( rc = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES, |
| ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq, |
| &ipoib_qp_op, netdev->name, |
| &ipoib->qp ) ) != 0 ) { |
| DBGC ( ipoib, "IPoIB %p could not create queue pair: %s\n", |
| ipoib, strerror ( rc ) ); |
| goto err_create_qp; |
| } |
| ib_qp_set_ownerdata ( ipoib->qp, ipoib ); |
| |
| /* Update MAC address with QPN */ |
| ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn ); |
| |
| /* Fill receive rings */ |
| ib_refill_recv ( ibdev, ipoib->qp ); |
| |
| /* Fake a link status change to join the broadcast group */ |
| ipoib_link_state_changed ( ipoib ); |
| |
| return 0; |
| |
| ib_destroy_qp ( ibdev, ipoib->qp ); |
| err_create_qp: |
| ib_destroy_cq ( ibdev, ipoib->cq ); |
| err_create_cq: |
| ib_close ( ibdev ); |
| err_ib_open: |
| return rc; |
| } |
| |
| /** |
| * Close IPoIB network device |
| * |
| * @v netdev Network device |
| */ |
| static void ipoib_close ( struct net_device *netdev ) { |
| struct ipoib_device *ipoib = netdev->priv; |
| struct ib_device *ibdev = ipoib->ibdev; |
| |
| /* Flush REMAC cache */ |
| ipoib_flush_remac ( ipoib ); |
| |
| /* Leave broadcast group */ |
| ipoib_leave_broadcast_group ( ipoib ); |
| |
| /* Remove QPN from MAC address */ |
| ipoib->mac.flags__qpn = 0; |
| |
| /* Tear down the queues */ |
| ib_destroy_qp ( ibdev, ipoib->qp ); |
| ipoib->qp = NULL; |
| ib_destroy_cq ( ibdev, ipoib->cq ); |
| ipoib->cq = NULL; |
| |
| /* Close IB device */ |
| ib_close ( ibdev ); |
| } |
| |
| /** IPoIB network device operations */ |
| static struct net_device_operations ipoib_operations = { |
| .open = ipoib_open, |
| .close = ipoib_close, |
| .transmit = ipoib_transmit, |
| .poll = ipoib_poll, |
| }; |
| |
| /** |
| * Probe IPoIB device |
| * |
| * @v ibdev Infiniband device |
| * @ret rc Return status code |
| */ |
| static int ipoib_probe ( struct ib_device *ibdev ) { |
| struct net_device *netdev; |
| struct ipoib_device *ipoib; |
| int rc; |
| |
| /* Allocate network device */ |
| netdev = alloc_ipoibdev ( sizeof ( *ipoib ) ); |
| if ( ! netdev ) |
| return -ENOMEM; |
| netdev_init ( netdev, &ipoib_operations ); |
| ipoib = netdev->priv; |
| netdev->dev = ibdev->dev; |
| memset ( ipoib, 0, sizeof ( *ipoib ) ); |
| ipoib->netdev = netdev; |
| ipoib->ibdev = ibdev; |
| INIT_LIST_HEAD ( &ipoib->peers ); |
| |
| /* Extract hardware address */ |
| memcpy ( netdev->hw_addr, &ibdev->gid.s.guid, |
| sizeof ( ibdev->gid.s.guid ) ); |
| memcpy ( netdev->ll_addr, ibdev->lemac, ETH_ALEN ); |
| |
| /* Set local MAC address */ |
| memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid, |
| sizeof ( ipoib->mac.gid.s.guid ) ); |
| |
| /* Set default broadcast MAC address */ |
| memcpy ( &ipoib->broadcast.mac, &ipoib_broadcast, |
| sizeof ( ipoib->broadcast.mac ) ); |
| |
| /* Add to list of IPoIB devices */ |
| list_add_tail ( &ipoib->list, &ipoib_devices ); |
| |
| /* Register network device */ |
| if ( ( rc = register_netdev ( netdev ) ) != 0 ) |
| goto err_register_netdev; |
| |
| return 0; |
| |
| unregister_netdev ( netdev ); |
| err_register_netdev: |
| list_del ( &ipoib->list ); |
| netdev_nullify ( netdev ); |
| netdev_put ( netdev ); |
| return rc; |
| } |
| |
| /** |
| * Handle device or link status change |
| * |
| * @v ibdev Infiniband device |
| */ |
| static void ipoib_notify ( struct ib_device *ibdev ) { |
| struct ipoib_device *ipoib; |
| |
| /* Handle link status change for any attached IPoIB devices */ |
| list_for_each_entry ( ipoib, &ipoib_devices, list ) { |
| if ( ipoib->ibdev != ibdev ) |
| continue; |
| ipoib_link_state_changed ( ipoib ); |
| } |
| } |
| |
| /** |
| * Remove IPoIB device |
| * |
| * @v ibdev Infiniband device |
| */ |
| static void ipoib_remove ( struct ib_device *ibdev ) { |
| struct ipoib_device *ipoib; |
| struct ipoib_device *tmp; |
| struct net_device *netdev; |
| |
| /* Remove any attached IPoIB devices */ |
| list_for_each_entry_safe ( ipoib, tmp, &ipoib_devices, list ) { |
| if ( ipoib->ibdev != ibdev ) |
| continue; |
| netdev = ipoib->netdev; |
| unregister_netdev ( netdev ); |
| list_del ( &ipoib->list ); |
| netdev_nullify ( netdev ); |
| netdev_put ( netdev ); |
| } |
| } |
| |
| /** IPoIB driver */ |
| struct ib_driver ipoib_driver __ib_driver = { |
| .name = "IPoIB", |
| .probe = ipoib_probe, |
| .notify = ipoib_notify, |
| .remove = ipoib_remove, |
| }; |
| |
| /** |
| * Find IPoIB network device |
| * |
| * @v ibdev Infiniband device |
| * @ret netdev IPoIB network device, or NULL if not found |
| */ |
| struct net_device * ipoib_netdev ( struct ib_device *ibdev ) { |
| struct ipoib_device *ipoib; |
| |
| /* Find matching IPoIB device */ |
| list_for_each_entry ( ipoib, &ipoib_devices, list ) { |
| if ( ipoib->ibdev != ibdev ) |
| continue; |
| return ipoib->netdev; |
| } |
| return NULL; |
| } |