From 30bf34f05c0f66d9d9221f7ebfe9808024337dd4 Mon Sep 17 00:00:00 2001 From: nbd Date: Sat, 22 Oct 2005 22:03:56 +0000 Subject: add pf_ring patches for kernel and libpcap git-svn-id: svn://svn.openwrt.org/openwrt/trunk/openwrt@2266 3c298f89-4303-0410-b956-a3cf2f4a3e73 --- .../linux-2.4/patches/generic/223-pf_ring.patch | 6444 ++++++++++++++++++++ 1 file changed, 6444 insertions(+) create mode 100644 target/linux/linux-2.4/patches/generic/223-pf_ring.patch (limited to 'target/linux/linux-2.4/patches') diff --git a/target/linux/linux-2.4/patches/generic/223-pf_ring.patch b/target/linux/linux-2.4/patches/generic/223-pf_ring.patch new file mode 100644 index 0000000000..1235e10445 --- /dev/null +++ b/target/linux/linux-2.4/patches/generic/223-pf_ring.patch @@ -0,0 +1,6444 @@ +diff --unified --recursive --new-file linux-2.4.30/include/linux/ring.h linux-2.4.30-1-686-smp-ring3/include/linux/ring.h +--- linux-2.4.30/include/linux/ring.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/include/linux/ring.h 2005-10-22 23:08:27.388011250 +0200 +@@ -0,0 +1,108 @@ ++/* ++ * Definitions for packet ring ++ * ++ * 2004 - Luca Deri ++ */ ++#ifndef __RING_H ++#define __RING_H ++ ++ ++#define INCLUDE_MAC_INFO ++ ++#ifdef INCLUDE_MAC_INFO ++#define SKB_DISPLACEMENT 14 /* Include MAC address information */ ++#else ++#define SKB_DISPLACEMENT 0 /* Do NOT include MAC address information */ ++#endif ++ ++#define RING_MAGIC ++#define RING_MAGIC_VALUE 0x88 ++#define RING_FLOWSLOT_VERSION 5 ++#define RING_VERSION "3.0" ++ ++#define SO_ADD_TO_CLUSTER 99 ++#define SO_REMOVE_FROM_CLUSTER 100 ++#define SO_SET_REFLECTOR 101 ++ ++/* *********************************** */ ++ ++#ifndef HAVE_PCAP ++struct pcap_pkthdr { ++ struct timeval ts; /* time stamp */ ++ u_int32_t caplen; /* length of portion present */ ++ u_int32_t len; /* length this packet (off wire) */ ++}; ++#endif ++ ++/* *********************************** */ ++ ++enum cluster_type { ++ cluster_per_flow = 0, ++ 
cluster_round_robin ++}; ++ ++/* *********************************** */ ++ ++#define RING_MIN_SLOT_SIZE (60+sizeof(struct pcap_pkthdr)) ++#define RING_MAX_SLOT_SIZE (1514+sizeof(struct pcap_pkthdr)) ++ ++/* *********************************** */ ++ ++typedef struct flowSlotInfo { ++ u_int16_t version, sample_rate; ++ u_int32_t tot_slots, slot_len, tot_mem; ++ ++ u_int64_t tot_pkts, tot_lost; ++ u_int64_t tot_insert, tot_read; ++ u_int16_t insert_idx; ++ u_int16_t remove_idx; ++} FlowSlotInfo; ++ ++/* *********************************** */ ++ ++typedef struct flowSlot { ++#ifdef RING_MAGIC ++ u_char magic; /* It must alwasy be zero */ ++#endif ++ u_char slot_state; /* 0=empty, 1=full */ ++ u_char bucket; /* bucket[bucketLen] */ ++} FlowSlot; ++ ++/* *********************************** */ ++ ++#ifdef __KERNEL__ ++ ++FlowSlotInfo* getRingPtr(void); ++int allocateRing(char *deviceName, u_int numSlots, ++ u_int bucketLen, u_int sampleRate); ++unsigned int pollRing(struct file *fp, struct poll_table_struct * wait); ++void deallocateRing(void); ++ ++/* ************************* */ ++ ++typedef int (*handle_ring_skb)(struct sk_buff *skb, ++ u_char recv_packet, u_char real_skb); ++extern handle_ring_skb get_skb_ring_handler(void); ++extern void set_skb_ring_handler(handle_ring_skb the_handler); ++extern void do_skb_ring_handler(struct sk_buff *skb, ++ u_char recv_packet, u_char real_skb); ++ ++typedef int (*handle_ring_buffer)(struct net_device *dev, ++ char *data, int len); ++extern handle_ring_buffer get_buffer_ring_handler(void); ++extern void set_buffer_ring_handler(handle_ring_buffer the_handler); ++extern int do_buffer_ring_handler(struct net_device *dev, ++ char *data, int len); ++#endif /* __KERNEL__ */ ++ ++/* *********************************** */ ++ ++#define PF_RING 27 /* Packet Ring */ ++#define SOCK_RING PF_RING ++ ++/* ioctl() */ ++#define SIORINGPOLL 0x8888 ++ ++/* *********************************** */ ++ ++#endif /* __RING_H */ +diff --unified --recursive 
--new-file linux-2.4.30/include/net/sock.h linux-2.4.30-1-686-smp-ring3/include/net/sock.h +--- linux-2.4.30/include/net/sock.h 2004-11-17 12:54:22.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/include/net/sock.h 2005-10-22 23:08:27.976048000 +0200 +@@ -699,6 +699,9 @@ + #if defined (CONFIG_PACKET) || defined(CONFIG_PACKET_MODULE) + struct packet_opt *af_packet; + #endif ++#if defined(CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ struct ring_opt *pf_ring; ++#endif + #if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE) + x25_cb *x25; + #endif +diff --unified --recursive --new-file linux-2.4.30/include/net/sock.h.ORG linux-2.4.30-1-686-smp-ring3/include/net/sock.h.ORG +--- linux-2.4.30/include/net/sock.h.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/include/net/sock.h.ORG 2005-10-22 23:08:27.940045750 +0200 +@@ -0,0 +1,1400 @@ ++/* ++ * INET An implementation of the TCP/IP protocol suite for the LINUX ++ * operating system. INET is implemented using the BSD Socket ++ * interface as the means of communication with the user level. ++ * ++ * Definitions for the AF_INET socket handler. ++ * ++ * Version: @(#)sock.h 1.0.4 05/13/93 ++ * ++ * Authors: Ross Biro, ++ * Fred N. van Kempen, ++ * Corey Minyard ++ * Florian La Roche ++ * ++ * Fixes: ++ * Alan Cox : Volatiles in skbuff pointers. See ++ * skbuff comments. May be overdone, ++ * better to prove they can be removed ++ * than the reverse. ++ * Alan Cox : Added a zapped field for tcp to note ++ * a socket is reset and must stay shut up ++ * Alan Cox : New fields for options ++ * Pauline Middelink : identd support ++ * Alan Cox : Eliminate low level recv/recvfrom ++ * David S. Miller : New socket lookup architecture. 
++ * Steve Whitehouse: Default routines for sock_ops ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++#ifndef _SOCK_H ++#define _SOCK_H ++ ++#include ++#include ++#include ++#include /* struct sockaddr_in */ ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++#include /* struct sockaddr_in6 */ ++#include /* dest_cache, inet6_options */ ++#include ++#include /* struct ipv6_mc_socklist */ ++#endif ++ ++#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) ++#include ++#endif ++#include /* struct tcphdr */ ++#if defined(CONFIG_IP_SCTP) || defined (CONFIG_IP_SCTP_MODULE) ++#include /* struct sctp_opt */ ++#endif ++ ++#include ++#include /* struct sk_buff */ ++#include /* struct inet_protocol */ ++#if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE) ++#include ++#endif ++#if defined(CONFIG_WAN_ROUTER) || defined(CONFIG_WAN_ROUTER_MODULE) ++#include ++#endif ++ ++#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) ++#include ++#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) ++#include ++#endif ++#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) ++#include ++#endif ++#endif ++ ++#if defined(CONFIG_PPPOE) || defined(CONFIG_PPPOE_MODULE) ++#include ++#include /* struct ppp_channel */ ++#endif ++ ++#if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) ++#if defined(CONFIG_SPX) || defined(CONFIG_SPX_MODULE) ++#include ++#else ++#include ++#endif /* CONFIG_SPX */ ++#endif /* CONFIG_IPX */ ++ ++#if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE) ++#include ++#endif ++ ++#if defined(CONFIG_DECNET) || defined(CONFIG_DECNET_MODULE) ++#include ++#endif ++ ++#if defined(CONFIG_IRDA) || defined(CONFIG_IRDA_MODULE) ++#include ++#endif ++ ++#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) ++struct atm_vcc; ++#endif ++ 
++#ifdef CONFIG_FILTER ++#include ++#endif ++ ++#include ++#include ++ ++ ++/* The AF_UNIX specific socket options */ ++struct unix_opt { ++ struct unix_address *addr; ++ struct dentry * dentry; ++ struct vfsmount * mnt; ++ struct semaphore readsem; ++ struct sock * other; ++ struct sock ** list; ++ struct sock * gc_tree; ++ atomic_t inflight; ++ rwlock_t lock; ++ wait_queue_head_t peer_wait; ++}; ++ ++ ++/* Once the IPX ncpd patches are in these are going into protinfo. */ ++#if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) ++struct ipx_opt { ++ ipx_address dest_addr; ++ ipx_interface *intrfc; ++ unsigned short port; ++#ifdef CONFIG_IPX_INTERN ++ unsigned char node[IPX_NODE_LEN]; ++#endif ++ unsigned short type; ++/* ++ * To handle special ncp connection-handling sockets for mars_nwe, ++ * the connection number must be stored in the socket. ++ */ ++ unsigned short ipx_ncp_conn; ++}; ++#endif ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++struct ipv6_pinfo { ++ struct in6_addr saddr; ++ struct in6_addr rcv_saddr; ++ struct in6_addr daddr; ++ struct in6_addr *daddr_cache; ++ ++ __u32 flow_label; ++ __u32 frag_size; ++ int hop_limit; ++ int mcast_hops; ++ int mcast_oif; ++ ++ /* pktoption flags */ ++ union { ++ struct { ++ __u8 srcrt:2, ++ rxinfo:1, ++ rxhlim:1, ++ hopopts:1, ++ dstopts:1, ++ authhdr:1, ++ rxflow:1; ++ } bits; ++ __u8 all; ++ } rxopt; ++ ++ /* sockopt flags */ ++ __u8 mc_loop:1, ++ recverr:1, ++ sndflow:1, ++ pmtudisc:2, ++ ipv6only:1; ++ ++ struct ipv6_mc_socklist *ipv6_mc_list; ++ struct ipv6_ac_socklist *ipv6_ac_list; ++ struct ipv6_fl_socklist *ipv6_fl_list; ++ __u32 dst_cookie; ++ ++ struct ipv6_txoptions *opt; ++ struct sk_buff *pktoptions; ++}; ++ ++struct raw6_opt { ++ __u32 checksum; /* perform checksum */ ++ __u32 offset; /* checksum offset */ ++ ++ struct icmp6_filter filter; ++}; ++ ++#define __ipv6_only_sock(sk) ((sk)->net_pinfo.af_inet6.ipv6only) ++#define ipv6_only_sock(sk) ((sk)->family == PF_INET6 && \ ++ 
(sk)->net_pinfo.af_inet6.ipv6only) ++#else ++#define __ipv6_only_sock(sk) 0 ++#define ipv6_only_sock(sk) 0 ++#endif /* IPV6 */ ++ ++#if defined(CONFIG_INET) || defined(CONFIG_INET_MODULE) ++struct raw_opt { ++ struct icmp_filter filter; ++}; ++#endif ++ ++#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) ++struct inet_opt ++{ ++ int ttl; /* TTL setting */ ++ int tos; /* TOS */ ++ unsigned cmsg_flags; ++ struct ip_options *opt; ++ unsigned char hdrincl; /* Include headers ? */ ++ __u8 mc_ttl; /* Multicasting TTL */ ++ __u8 mc_loop; /* Loopback */ ++ unsigned recverr : 1, ++ freebind : 1; ++ __u16 id; /* ID counter for DF pkts */ ++ __u8 pmtudisc; ++ int mc_index; /* Multicast device index */ ++ __u32 mc_addr; ++ struct ip_mc_socklist *mc_list; /* Group array */ ++}; ++#endif ++ ++#if defined(CONFIG_PPPOE) || defined (CONFIG_PPPOE_MODULE) ++struct pppoe_opt ++{ ++ struct net_device *dev; /* device associated with socket*/ ++ struct pppoe_addr pa; /* what this socket is bound to*/ ++ struct sockaddr_pppox relay; /* what socket data will be ++ relayed to (PPPoE relaying) */ ++}; ++ ++struct pppox_opt ++{ ++ struct ppp_channel chan; ++ struct sock *sk; ++ struct pppox_opt *next; /* for hash table */ ++ union { ++ struct pppoe_opt pppoe; ++ } proto; ++}; ++#define pppoe_dev proto.pppoe.dev ++#define pppoe_pa proto.pppoe.pa ++#define pppoe_relay proto.pppoe.relay ++#endif ++ ++/* This defines a selective acknowledgement block. */ ++struct tcp_sack_block { ++ __u32 start_seq; ++ __u32 end_seq; ++}; ++ ++enum tcp_congestion_algo { ++ TCP_RENO=0, ++ TCP_VEGAS, ++ TCP_WESTWOOD, ++ TCP_BIC, ++}; ++ ++struct tcp_opt { ++ int tcp_header_len; /* Bytes of tcp header to send */ ++ ++/* ++ * Header prediction flags ++ * 0x5?10 << 16 + snd_wnd in net byte order ++ */ ++ __u32 pred_flags; ++ ++/* ++ * RFC793 variables by their proper names. This means you can ++ * read the code and the spec side by side (and laugh ...) ++ * See RFC793 and RFC1122. 
The RFC writes these in capitals. ++ */ ++ __u32 rcv_nxt; /* What we want to receive next */ ++ __u32 snd_nxt; /* Next sequence we send */ ++ ++ __u32 snd_una; /* First byte we want an ack for */ ++ __u32 snd_sml; /* Last byte of the most recently transmitted small packet */ ++ __u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ ++ __u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ ++ ++ /* Delayed ACK control data */ ++ struct { ++ __u8 pending; /* ACK is pending */ ++ __u8 quick; /* Scheduled number of quick acks */ ++ __u8 pingpong; /* The session is interactive */ ++ __u8 blocked; /* Delayed ACK was blocked by socket lock*/ ++ __u32 ato; /* Predicted tick of soft clock */ ++ unsigned long timeout; /* Currently scheduled timeout */ ++ __u32 lrcvtime; /* timestamp of last received data packet*/ ++ __u16 last_seg_size; /* Size of last incoming segment */ ++ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ ++ } ack; ++ ++ /* Data for direct copy to user */ ++ struct { ++ struct sk_buff_head prequeue; ++ struct task_struct *task; ++ struct iovec *iov; ++ int memory; ++ int len; ++ } ucopy; ++ ++ __u32 snd_wl1; /* Sequence for window update */ ++ __u32 snd_wnd; /* The window we expect to receive */ ++ __u32 max_window; /* Maximal window ever seen from peer */ ++ __u32 pmtu_cookie; /* Last pmtu seen by socket */ ++ __u16 mss_cache; /* Cached effective mss, not including SACKS */ ++ __u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ ++ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ ++ __u8 ca_state; /* State of fast-retransmit machine */ ++ __u8 retransmits; /* Number of unrecovered RTO timeouts. */ ++ ++ __u8 reordering; /* Packet reordering metric. 
*/ ++ __u8 queue_shrunk; /* Write queue has been shrunk recently.*/ ++ __u8 defer_accept; /* User waits for some data after accept() */ ++ ++/* RTT measurement */ ++ __u8 backoff; /* backoff */ ++ __u32 srtt; /* smothed round trip time << 3 */ ++ __u32 mdev; /* medium deviation */ ++ __u32 mdev_max; /* maximal mdev for the last rtt period */ ++ __u32 rttvar; /* smoothed mdev_max */ ++ __u32 rtt_seq; /* sequence number to update rttvar */ ++ __u32 rto; /* retransmit timeout */ ++ ++ __u32 packets_out; /* Packets which are "in flight" */ ++ __u32 left_out; /* Packets which leaved network */ ++ __u32 retrans_out; /* Retransmitted packets out */ ++ ++ ++/* ++ * Slow start and congestion control (see also Nagle, and Karn & Partridge) ++ */ ++ __u32 snd_ssthresh; /* Slow start size threshold */ ++ __u32 snd_cwnd; /* Sending congestion window */ ++ __u16 snd_cwnd_cnt; /* Linear increase counter */ ++ __u16 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ ++ __u32 snd_cwnd_used; ++ __u32 snd_cwnd_stamp; ++ ++ /* Two commonly used timers in both sender and receiver paths. */ ++ unsigned long timeout; ++ struct timer_list retransmit_timer; /* Resend (no ack) */ ++ struct timer_list delack_timer; /* Ack delay */ ++ ++ struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ ++ ++ struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ ++ struct sk_buff *send_head; /* Front of stuff to transmit */ ++ struct page *sndmsg_page; /* Cached page for sendmsg */ ++ u32 sndmsg_off; /* Cached offset for sendmsg */ ++ ++ __u32 rcv_wnd; /* Current receiver window */ ++ __u32 rcv_wup; /* rcv_nxt on last window update sent */ ++ __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ ++ __u32 pushed_seq; /* Last pushed seq, required to talk to windows */ ++ __u32 copied_seq; /* Head of yet unread data */ ++/* ++ * Options received (usually on last packet, some only on SYN packets). 
++ */ ++ char tstamp_ok, /* TIMESTAMP seen on SYN packet */ ++ wscale_ok, /* Wscale seen on SYN packet */ ++ sack_ok; /* SACK seen on SYN packet */ ++ char saw_tstamp; /* Saw TIMESTAMP on last packet */ ++ __u8 snd_wscale; /* Window scaling received from sender */ ++ __u8 rcv_wscale; /* Window scaling to send to receiver */ ++ __u8 nonagle; /* Disable Nagle algorithm? */ ++ __u8 keepalive_probes; /* num of allowed keep alive probes */ ++ ++/* PAWS/RTTM data */ ++ __u32 rcv_tsval; /* Time stamp value */ ++ __u32 rcv_tsecr; /* Time stamp echo reply */ ++ __u32 ts_recent; /* Time stamp to echo next */ ++ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ ++ ++/* SACKs data */ ++ __u16 user_mss; /* mss requested by user in ioctl */ ++ __u8 dsack; /* D-SACK is scheduled */ ++ __u8 eff_sacks; /* Size of SACK array to send with next packet */ ++ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ ++ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ ++ ++ __u32 window_clamp; /* Maximal window to advertise */ ++ __u32 rcv_ssthresh; /* Current window clamp */ ++ __u8 probes_out; /* unanswered 0 window probes */ ++ __u8 num_sacks; /* Number of SACK blocks */ ++ __u16 advmss; /* Advertised MSS */ ++ ++ __u8 syn_retries; /* num of allowed syn retries */ ++ __u8 ecn_flags; /* ECN status bits. */ ++ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ ++ __u32 lost_out; /* Lost packets */ ++ __u32 sacked_out; /* SACK'd packets */ ++ __u32 fackets_out; /* FACK'd packets */ ++ __u32 high_seq; /* snd_nxt at onset of congestion */ ++ ++ __u32 retrans_stamp; /* Timestamp of the last retransmit, ++ * also used in SYN-SENT to remember stamp of ++ * the first SYN. */ ++ __u32 undo_marker; /* tracking retrans started here. */ ++ int undo_retrans; /* number of undoable retransmissions. 
*/ ++ __u32 urg_seq; /* Seq of received urgent pointer */ ++ __u16 urg_data; /* Saved octet of OOB data and control flags */ ++ __u8 pending; /* Scheduled timer event */ ++ __u8 urg_mode; /* In urgent mode */ ++ __u32 snd_up; /* Urgent pointer */ ++ ++ /* The syn_wait_lock is necessary only to avoid tcp_get_info having ++ * to grab the main lock sock while browsing the listening hash ++ * (otherwise it's deadlock prone). ++ * This lock is acquired in read mode only from tcp_get_info() and ++ * it's acquired in write mode _only_ from code that is actively ++ * changing the syn_wait_queue. All readers that are holding ++ * the master sock lock don't need to grab this lock in read mode ++ * too as the syn_wait_queue writes are always protected from ++ * the main sock lock. ++ */ ++ rwlock_t syn_wait_lock; ++ struct tcp_listen_opt *listen_opt; ++ ++ /* FIFO of established children */ ++ struct open_request *accept_queue; ++ struct open_request *accept_queue_tail; ++ ++ int write_pending; /* A write to socket waits to start. */ ++ ++ unsigned int keepalive_time; /* time before keep alive takes place */ ++ unsigned int keepalive_intvl; /* time interval between keep alive probes */ ++ int linger2; ++ ++ __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ ++ __u8 frto_counter; /* Number of new acks after RTO */ ++ __u32 frto_highmark; /* snd_nxt when RTO occurred */ ++ ++ unsigned long last_synq_overflow; ++ ++/* Receiver side RTT estimation */ ++ struct { ++ __u32 rtt; ++ __u32 seq; ++ __u32 time; ++ } rcv_rtt_est; ++ ++/* Receiver queue space */ ++ struct { ++ int space; ++ __u32 seq; ++ __u32 time; ++ } rcvq_space; ++ ++/* TCP Westwood structure */ ++ struct { ++ __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ ++ __u32 bw_est; /* bandwidth estimate */ ++ __u32 rtt_win_sx; /* here starts a new evaluation... 
*/ ++ __u32 bk; ++ __u32 snd_una; /* used for evaluating the number of acked bytes */ ++ __u32 cumul_ack; ++ __u32 accounted; ++ __u32 rtt; ++ __u32 rtt_min; /* minimum observed RTT */ ++ } westwood; ++ ++/* Vegas variables */ ++ struct { ++ __u32 beg_snd_nxt; /* right edge during last RTT */ ++ __u32 beg_snd_una; /* left edge during last RTT */ ++ __u32 beg_snd_cwnd; /* saves the size of the cwnd */ ++ __u8 doing_vegas_now;/* if true, do vegas for this RTT */ ++ __u16 cntRTT; /* # of RTTs measured within last RTT */ ++ __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ ++ __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ ++ } vegas; ++ ++ /* BI TCP Parameters */ ++ struct { ++ __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ ++ __u32 last_max_cwnd; /* last maximium snd_cwnd */ ++ __u32 last_cwnd; /* the last snd_cwnd */ ++ __u32 last_stamp; /* time when updated last_cwnd */ ++ } bictcp; ++}; ++ ++ ++/* ++ * This structure really needs to be cleaned up. ++ * Most of it is for TCP, and not used by any of ++ * the other protocols. ++ */ ++ ++/* ++ * The idea is to start moving to a newer struct gradualy ++ * ++ * IMHO the newer struct should have the following format: ++ * ++ * struct sock { ++ * sockmem [mem, proto, callbacks] ++ * ++ * union or struct { ++ * ax25; ++ * } ll_pinfo; ++ * ++ * union { ++ * ipv4; ++ * ipv6; ++ * ipx; ++ * netrom; ++ * rose; ++ * x25; ++ * } net_pinfo; ++ * ++ * union { ++ * tcp; ++ * udp; ++ * spx; ++ * netrom; ++ * } tp_pinfo; ++ * ++ * } ++ * ++ * The idea failed because IPv6 transition asssumes dual IP/IPv6 sockets. ++ * So, net_pinfo is IPv6 are really, and protinfo unifies all another ++ * private areas. ++ */ ++ ++/* Define this to get the sk->debug debugging facility. */ ++#define SOCK_DEBUGGING ++#ifdef SOCK_DEBUGGING ++#define SOCK_DEBUG(sk, msg...) do { if((sk) && ((sk)->debug)) printk(KERN_DEBUG msg); } while (0) ++#else ++#define SOCK_DEBUG(sk, msg...) 
do { } while (0) ++#endif ++ ++/* This is the per-socket lock. The spinlock provides a synchronization ++ * between user contexts and software interrupt processing, whereas the ++ * mini-semaphore synchronizes multiple users amongst themselves. ++ */ ++typedef struct { ++ spinlock_t slock; ++ unsigned int users; ++ wait_queue_head_t wq; ++} socket_lock_t; ++ ++#define sock_lock_init(__sk) \ ++do { spin_lock_init(&((__sk)->lock.slock)); \ ++ (__sk)->lock.users = 0; \ ++ init_waitqueue_head(&((__sk)->lock.wq)); \ ++} while(0) ++ ++struct sock { ++ /* Socket demultiplex comparisons on incoming packets. */ ++ __u32 daddr; /* Foreign IPv4 addr */ ++ __u32 rcv_saddr; /* Bound local IPv4 addr */ ++ __u16 dport; /* Destination port */ ++ unsigned short num; /* Local port */ ++ int bound_dev_if; /* Bound device index if != 0 */ ++ ++ /* Main hash linkage for various protocol lookup tables. */ ++ struct sock *next; ++ struct sock **pprev; ++ struct sock *bind_next; ++ struct sock **bind_pprev; ++ ++ volatile unsigned char state, /* Connection state */ ++ zapped; /* In ax25 & ipx means not linked */ ++ __u16 sport; /* Source port */ ++ ++ unsigned short family; /* Address family */ ++ unsigned char reuse; /* SO_REUSEADDR setting */ ++ unsigned char shutdown; ++ atomic_t refcnt; /* Reference count */ ++ ++ socket_lock_t lock; /* Synchronizer... */ ++ int rcvbuf; /* Size of receive buffer in bytes */ ++ ++ wait_queue_head_t *sleep; /* Sock wait queue */ ++ struct dst_entry *dst_cache; /* Destination cache */ ++ rwlock_t dst_lock; ++ atomic_t rmem_alloc; /* Receive queue bytes committed */ ++ struct sk_buff_head receive_queue; /* Incoming packets */ ++ atomic_t wmem_alloc; /* Transmit queue bytes committed */ ++ struct sk_buff_head write_queue; /* Packet sending queue */ ++ atomic_t omem_alloc; /* "o" is "option" or "other" */ ++ int wmem_queued; /* Persistent queue size */ ++ int forward_alloc; /* Space allocated forward. 
*/ ++ __u32 saddr; /* Sending source */ ++ unsigned int allocation; /* Allocation mode */ ++ int sndbuf; /* Size of send buffer in bytes */ ++ struct sock *prev; ++ ++ /* Not all are volatile, but some are, so we might as well say they all are. ++ * XXX Make this a flag word -DaveM ++ */ ++ volatile char dead, ++ done, ++ urginline, ++ keepopen, ++ linger, ++ destroy, ++ no_check, ++ broadcast, ++ bsdism; ++ unsigned char debug; ++ unsigned char rcvtstamp; ++ unsigned char use_write_queue; ++ unsigned char userlocks; ++ /* Hole of 3 bytes. Try to pack. */ ++ int route_caps; ++ int proc; ++ unsigned long lingertime; ++ ++ int hashent; ++ struct sock *pair; ++ ++ /* The backlog queue is special, it is always used with ++ * the per-socket spinlock held and requires low latency ++ * access. Therefore we special case it's implementation. ++ */ ++ struct { ++ struct sk_buff *head; ++ struct sk_buff *tail; ++ } backlog; ++ ++ rwlock_t callback_lock; ++ ++ /* Error queue, rarely used. */ ++ struct sk_buff_head error_queue; ++ ++ struct proto *prot; ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ union { ++ struct ipv6_pinfo af_inet6; ++ } net_pinfo; ++#endif ++ ++ union { ++ struct tcp_opt af_tcp; ++#if defined(CONFIG_IP_SCTP) || defined (CONFIG_IP_SCTP_MODULE) ++ struct sctp_opt af_sctp; ++#endif ++#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) ++ struct raw_opt tp_raw4; ++#endif ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ struct raw6_opt tp_raw; ++#endif /* CONFIG_IPV6 */ ++#if defined(CONFIG_SPX) || defined (CONFIG_SPX_MODULE) ++ struct spx_opt af_spx; ++#endif /* CONFIG_SPX */ ++ ++ } tp_pinfo; ++ ++ int err, err_soft; /* Soft holds errors that don't ++ cause failure but are the cause ++ of a persistent failure not just ++ 'timed out' */ ++ unsigned short ack_backlog; ++ unsigned short max_ack_backlog; ++ __u32 priority; ++ unsigned short type; ++ unsigned char localroute; /* Route locally only */ ++ unsigned char protocol; 
++ struct ucred peercred; ++ int rcvlowat; ++ long rcvtimeo; ++ long sndtimeo; ++ ++#ifdef CONFIG_FILTER ++ /* Socket Filtering Instructions */ ++ struct sk_filter *filter; ++#endif /* CONFIG_FILTER */ ++ ++ /* This is where all the private (optional) areas that don't ++ * overlap will eventually live. ++ */ ++ union { ++ void *destruct_hook; ++ struct unix_opt af_unix; ++#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) ++ struct inet_opt af_inet; ++#endif ++#if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE) ++ struct atalk_sock af_at; ++#endif ++#if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) ++ struct ipx_opt af_ipx; ++#endif ++#if defined (CONFIG_DECNET) || defined(CONFIG_DECNET_MODULE) ++ struct dn_scp dn; ++#endif ++#if defined (CONFIG_PACKET) || defined(CONFIG_PACKET_MODULE) ++ struct packet_opt *af_packet; ++#endif ++#if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE) ++ x25_cb *x25; ++#endif ++#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) ++ ax25_cb *ax25; ++#endif ++#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) ++ nr_cb *nr; ++#endif ++#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) ++ rose_cb *rose; ++#endif ++#if defined(CONFIG_PPPOE) || defined(CONFIG_PPPOE_MODULE) ++ struct pppox_opt *pppox; ++#endif ++ struct netlink_opt *af_netlink; ++#if defined(CONFIG_ECONET) || defined(CONFIG_ECONET_MODULE) ++ struct econet_opt *af_econet; ++#endif ++#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) ++ struct atm_vcc *af_atm; ++#endif ++#if defined(CONFIG_IRDA) || defined(CONFIG_IRDA_MODULE) ++ struct irda_sock *irda; ++#endif ++#if defined(CONFIG_WAN_ROUTER) || defined(CONFIG_WAN_ROUTER_MODULE) ++ struct wanpipe_opt *af_wanpipe; ++#endif ++ } protinfo; ++ ++ ++ /* This part is used for the timeout functions. */ ++ struct timer_list timer; /* This is the sock cleanup timer. 
*/ ++ struct timeval stamp; ++ ++ /* Identd and reporting IO signals */ ++ struct socket *socket; ++ ++ /* RPC layer private data */ ++ void *user_data; ++ ++ /* Callbacks */ ++ void (*state_change)(struct sock *sk); ++ void (*data_ready)(struct sock *sk,int bytes); ++ void (*write_space)(struct sock *sk); ++ void (*error_report)(struct sock *sk); ++ ++ int (*backlog_rcv) (struct sock *sk, ++ struct sk_buff *skb); ++ void (*destruct)(struct sock *sk); ++}; ++ ++/* The per-socket spinlock must be held here. */ ++#define sk_add_backlog(__sk, __skb) \ ++do { if((__sk)->backlog.tail == NULL) { \ ++ (__sk)->backlog.head = \ ++ (__sk)->backlog.tail = (__skb); \ ++ } else { \ ++ ((__sk)->backlog.tail)->next = (__skb); \ ++ (__sk)->backlog.tail = (__skb); \ ++ } \ ++ (__skb)->next = NULL; \ ++} while(0) ++ ++/* IP protocol blocks we attach to sockets. ++ * socket layer -> transport layer interface ++ * transport -> network interface is defined by struct inet_proto ++ */ ++struct proto { ++ void (*close)(struct sock *sk, ++ long timeout); ++ int (*connect)(struct sock *sk, ++ struct sockaddr *uaddr, ++ int addr_len); ++ int (*disconnect)(struct sock *sk, int flags); ++ ++ struct sock * (*accept) (struct sock *sk, int flags, int *err); ++ ++ int (*ioctl)(struct sock *sk, int cmd, ++ unsigned long arg); ++ int (*init)(struct sock *sk); ++ int (*destroy)(struct sock *sk); ++ void (*shutdown)(struct sock *sk, int how); ++ int (*setsockopt)(struct sock *sk, int level, ++ int optname, char *optval, int optlen); ++ int (*getsockopt)(struct sock *sk, int level, ++ int optname, char *optval, ++ int *option); ++ int (*sendmsg)(struct sock *sk, struct msghdr *msg, ++ int len); ++ int (*recvmsg)(struct sock *sk, struct msghdr *msg, ++ int len, int noblock, int flags, ++ int *addr_len); ++ int (*bind)(struct sock *sk, ++ struct sockaddr *uaddr, int addr_len); ++ ++ int (*backlog_rcv) (struct sock *sk, ++ struct sk_buff *skb); ++ ++ /* Keeping track of sk's, looking them up, and port 
selection methods. */ ++ void (*hash)(struct sock *sk); ++ void (*unhash)(struct sock *sk); ++ int (*get_port)(struct sock *sk, unsigned short snum); ++ ++ char name[32]; ++ ++ struct { ++ int inuse; ++ u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; ++ } stats[NR_CPUS]; ++}; ++ ++/* Called with local bh disabled */ ++static __inline__ void sock_prot_inc_use(struct proto *prot) ++{ ++ prot->stats[smp_processor_id()].inuse++; ++} ++ ++static __inline__ void sock_prot_dec_use(struct proto *prot) ++{ ++ prot->stats[smp_processor_id()].inuse--; ++} ++ ++/* About 10 seconds */ ++#define SOCK_DESTROY_TIME (10*HZ) ++ ++/* Sockets 0-1023 can't be bound to unless you are superuser */ ++#define PROT_SOCK 1024 ++ ++#define SHUTDOWN_MASK 3 ++#define RCV_SHUTDOWN 1 ++#define SEND_SHUTDOWN 2 ++ ++#define SOCK_SNDBUF_LOCK 1 ++#define SOCK_RCVBUF_LOCK 2 ++#define SOCK_BINDADDR_LOCK 4 ++#define SOCK_BINDPORT_LOCK 8 ++ ++ ++/* Used by processes to "lock" a socket state, so that ++ * interrupts and bottom half handlers won't change it ++ * from under us. It essentially blocks any incoming ++ * packets, so that we won't get any new data or any ++ * packets that change the state of the socket. ++ * ++ * While locked, BH processing will add new packets to ++ * the backlog queue. This queue is processed by the ++ * owner of the socket lock right before it is released. ++ * ++ * Since ~2.3.5 it is also exclusive sleep lock serializing ++ * accesses from user process context. 
++ */ ++extern void __lock_sock(struct sock *sk); ++extern void __release_sock(struct sock *sk); ++#define lock_sock(__sk) \ ++do { spin_lock_bh(&((__sk)->lock.slock)); \ ++ if ((__sk)->lock.users != 0) \ ++ __lock_sock(__sk); \ ++ (__sk)->lock.users = 1; \ ++ spin_unlock_bh(&((__sk)->lock.slock)); \ ++} while(0) ++ ++#define release_sock(__sk) \ ++do { spin_lock_bh(&((__sk)->lock.slock)); \ ++ if ((__sk)->backlog.tail != NULL) \ ++ __release_sock(__sk); \ ++ (__sk)->lock.users = 0; \ ++ if (waitqueue_active(&((__sk)->lock.wq))) wake_up(&((__sk)->lock.wq)); \ ++ spin_unlock_bh(&((__sk)->lock.slock)); \ ++} while(0) ++ ++/* BH context may only use the following locking interface. */ ++#define bh_lock_sock(__sk) spin_lock(&((__sk)->lock.slock)) ++#define bh_unlock_sock(__sk) spin_unlock(&((__sk)->lock.slock)) ++ ++extern struct sock * sk_alloc(int family, int priority, int zero_it); ++extern void sk_free(struct sock *sk); ++ ++extern struct sk_buff *sock_wmalloc(struct sock *sk, ++ unsigned long size, int force, ++ int priority); ++extern struct sk_buff *sock_rmalloc(struct sock *sk, ++ unsigned long size, int force, ++ int priority); ++extern void sock_wfree(struct sk_buff *skb); ++extern void sock_rfree(struct sk_buff *skb); ++ ++extern int sock_setsockopt(struct socket *sock, int level, ++ int op, char *optval, ++ int optlen); ++ ++extern int sock_getsockopt(struct socket *sock, int level, ++ int op, char *optval, ++ int *optlen); ++extern struct sk_buff *sock_alloc_send_skb(struct sock *sk, ++ unsigned long size, ++ int noblock, ++ int *errcode); ++extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk, ++ unsigned long header_len, ++ unsigned long data_len, ++ int noblock, ++ int *errcode); ++extern void *sock_kmalloc(struct sock *sk, int size, int priority); ++extern void sock_kfree_s(struct sock *sk, void *mem, int size); ++ ++/* ++ * Functions to fill in entries in struct proto_ops when a protocol ++ * does not implement a particular function. 
++ */ ++extern int sock_no_release(struct socket *); ++extern int sock_no_bind(struct socket *, ++ struct sockaddr *, int); ++extern int sock_no_connect(struct socket *, ++ struct sockaddr *, int, int); ++extern int sock_no_socketpair(struct socket *, ++ struct socket *); ++extern int sock_no_accept(struct socket *, ++ struct socket *, int); ++extern int sock_no_getname(struct socket *, ++ struct sockaddr *, int *, int); ++extern unsigned int sock_no_poll(struct file *, struct socket *, ++ struct poll_table_struct *); ++extern int sock_no_ioctl(struct socket *, unsigned int, ++ unsigned long); ++extern int sock_no_listen(struct socket *, int); ++extern int sock_no_shutdown(struct socket *, int); ++extern int sock_no_getsockopt(struct socket *, int , int, ++ char *, int *); ++extern int sock_no_setsockopt(struct socket *, int, int, ++ char *, int); ++extern int sock_no_fcntl(struct socket *, ++ unsigned int, unsigned long); ++extern int sock_no_sendmsg(struct socket *, ++ struct msghdr *, int, ++ struct scm_cookie *); ++extern int sock_no_recvmsg(struct socket *, ++ struct msghdr *, int, int, ++ struct scm_cookie *); ++extern int sock_no_mmap(struct file *file, ++ struct socket *sock, ++ struct vm_area_struct *vma); ++extern ssize_t sock_no_sendpage(struct socket *sock, ++ struct page *page, ++ int offset, size_t size, ++ int flags); ++ ++/* ++ * Default socket callbacks and setup code ++ */ ++ ++extern void sock_def_destruct(struct sock *); ++ ++/* Initialise core socket variables */ ++extern void sock_init_data(struct socket *sock, struct sock *sk); ++ ++extern void sklist_remove_socket(struct sock **list, struct sock *sk); ++extern void sklist_insert_socket(struct sock **list, struct sock *sk); ++extern void sklist_destroy_socket(struct sock **list, struct sock *sk); ++ ++#ifdef CONFIG_FILTER ++ ++/** ++ * sk_filter - run a packet through a socket filter ++ * @sk: sock associated with &sk_buff ++ * @skb: buffer to filter ++ * @needlock: set to 1 if the sock is 
not locked by caller. ++ * ++ * Run the filter code and then cut skb->data to correct size returned by ++ * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller ++ * than pkt_len we keep whole skb->data. This is the socket level ++ * wrapper to sk_run_filter. It returns 0 if the packet should ++ * be accepted or -EPERM if the packet should be tossed. ++ */ ++ ++static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock) ++{ ++ int err = 0; ++ ++ if (sk->filter) { ++ struct sk_filter *filter; ++ ++ if (needlock) ++ bh_lock_sock(sk); ++ ++ filter = sk->filter; ++ if (filter) { ++ int pkt_len = sk_run_filter(skb, filter->insns, ++ filter->len); ++ if (!pkt_len) ++ err = -EPERM; ++ else ++ skb_trim(skb, pkt_len); ++ } ++ ++ if (needlock) ++ bh_unlock_sock(sk); ++ } ++ return err; ++} ++ ++/** ++ * sk_filter_release: Release a socket filter ++ * @sk: socket ++ * @fp: filter to remove ++ * ++ * Remove a filter from a socket and release its resources. ++ */ ++ ++static inline void sk_filter_release(struct sock *sk, struct sk_filter *fp) ++{ ++ unsigned int size = sk_filter_len(fp); ++ ++ atomic_sub(size, &sk->omem_alloc); ++ ++ if (atomic_dec_and_test(&fp->refcnt)) ++ kfree(fp); ++} ++ ++static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp) ++{ ++ atomic_inc(&fp->refcnt); ++ atomic_add(sk_filter_len(fp), &sk->omem_alloc); ++} ++ ++#else ++ ++static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_FILTER */ ++ ++/* ++ * Socket reference counting postulates. ++ * ++ * * Each user of socket SHOULD hold a reference count. ++ * * Each access point to socket (an hash table bucket, reference from a list, ++ * running timer, skb in flight MUST hold a reference count. ++ * * When reference count hits 0, it means it will never increase back. 
++ * * When reference count hits 0, it means that no references from ++ * outside exist to this socket and current process on current CPU ++ * is last user and may/should destroy this socket. ++ * * sk_free is called from any context: process, BH, IRQ. When ++ * it is called, socket has no references from outside -> sk_free ++ * may release descendant resources allocated by the socket, but ++ * to the time when it is called, socket is NOT referenced by any ++ * hash tables, lists etc. ++ * * Packets, delivered from outside (from network or from another process) ++ * and enqueued on receive/error queues SHOULD NOT grab reference count, ++ * when they sit in queue. Otherwise, packets will leak to hole, when ++ * socket is looked up by one cpu and unhasing is made by another CPU. ++ * It is true for udp/raw, netlink (leak to receive and error queues), tcp ++ * (leak to backlog). Packet socket does all the processing inside ++ * BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets ++ * use separate SMP lock, so that they are prone too. ++ */ ++ ++/* Grab socket reference count. This operation is valid only ++ when sk is ALREADY grabbed f.e. it is found in hash table ++ or a list and the lookup is made under lock preventing hash table ++ modifications. ++ */ ++ ++static inline void sock_hold(struct sock *sk) ++{ ++ atomic_inc(&sk->refcnt); ++} ++ ++/* Ungrab socket in the context, which assumes that socket refcnt ++ cannot hit zero, f.e. it is true in context of any socketcall. ++ */ ++static inline void __sock_put(struct sock *sk) ++{ ++ atomic_dec(&sk->refcnt); ++} ++ ++/* Ungrab socket and destroy it, if it was the last reference. */ ++static inline void sock_put(struct sock *sk) ++{ ++ if (atomic_dec_and_test(&sk->refcnt)) ++ sk_free(sk); ++} ++ ++/* Detach socket from process context. ++ * Announce socket dead, detach it from wait queue and inode. 
++ * Note that parent inode held reference count on this struct sock, ++ * we do not release it in this function, because protocol ++ * probably wants some additional cleanups or even continuing ++ * to work with this socket (TCP). ++ */ ++static inline void sock_orphan(struct sock *sk) ++{ ++ write_lock_bh(&sk->callback_lock); ++ sk->dead = 1; ++ sk->socket = NULL; ++ sk->sleep = NULL; ++ write_unlock_bh(&sk->callback_lock); ++} ++ ++static inline void sock_graft(struct sock *sk, struct socket *parent) ++{ ++ write_lock_bh(&sk->callback_lock); ++ sk->sleep = &parent->wait; ++ parent->sk = sk; ++ sk->socket = parent; ++ write_unlock_bh(&sk->callback_lock); ++} ++ ++static inline int sock_i_uid(struct sock *sk) ++{ ++ int uid; ++ ++ read_lock(&sk->callback_lock); ++ uid = sk->socket ? sk->socket->inode->i_uid : 0; ++ read_unlock(&sk->callback_lock); ++ return uid; ++} ++ ++static inline unsigned long sock_i_ino(struct sock *sk) ++{ ++ unsigned long ino; ++ ++ read_lock(&sk->callback_lock); ++ ino = sk->socket ? 
sk->socket->inode->i_ino : 0; ++ read_unlock(&sk->callback_lock); ++ return ino; ++} ++ ++static inline struct dst_entry * ++__sk_dst_get(struct sock *sk) ++{ ++ return sk->dst_cache; ++} ++ ++static inline struct dst_entry * ++sk_dst_get(struct sock *sk) ++{ ++ struct dst_entry *dst; ++ ++ read_lock(&sk->dst_lock); ++ dst = sk->dst_cache; ++ if (dst) ++ dst_hold(dst); ++ read_unlock(&sk->dst_lock); ++ return dst; ++} ++ ++static inline void ++__sk_dst_set(struct sock *sk, struct dst_entry *dst) ++{ ++ struct dst_entry *old_dst; ++ ++ old_dst = sk->dst_cache; ++ sk->dst_cache = dst; ++ dst_release(old_dst); ++} ++ ++static inline void ++sk_dst_set(struct sock *sk, struct dst_entry *dst) ++{ ++ write_lock(&sk->dst_lock); ++ __sk_dst_set(sk, dst); ++ write_unlock(&sk->dst_lock); ++} ++ ++static inline void ++__sk_dst_reset(struct sock *sk) ++{ ++ struct dst_entry *old_dst; ++ ++ old_dst = sk->dst_cache; ++ sk->dst_cache = NULL; ++ dst_release(old_dst); ++} ++ ++static inline void ++sk_dst_reset(struct sock *sk) ++{ ++ write_lock(&sk->dst_lock); ++ __sk_dst_reset(sk); ++ write_unlock(&sk->dst_lock); ++} ++ ++static inline struct dst_entry * ++__sk_dst_check(struct sock *sk, u32 cookie) ++{ ++ struct dst_entry *dst = sk->dst_cache; ++ ++ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { ++ sk->dst_cache = NULL; ++ return NULL; ++ } ++ ++ return dst; ++} ++ ++static inline struct dst_entry * ++sk_dst_check(struct sock *sk, u32 cookie) ++{ ++ struct dst_entry *dst = sk_dst_get(sk); ++ ++ if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { ++ sk_dst_reset(sk); ++ return NULL; ++ } ++ ++ return dst; ++} ++ ++ ++/* ++ * Queue a received datagram if it will fit. Stream and sequenced ++ * protocols can't normally use this as they need to fit buffers in ++ * and play with them. ++ * ++ * Inlined as it's very short and called for pretty much every ++ * packet ever received. 
++ */
++
++static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
++{
++	sock_hold(sk);
++	skb->sk = sk;
++	skb->destructor = sock_wfree;
++	atomic_add(skb->truesize, &sk->wmem_alloc);
++}
++
++static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
++{
++	skb->sk = sk;
++	skb->destructor = sock_rfree;
++	atomic_add(skb->truesize, &sk->rmem_alloc);
++}
++
++static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
++{
++	int err = 0;
++	int skb_len;
++
++	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
++	   number of warnings when compiling with -W --ANK
++	 */
++	if (atomic_read(&sk->rmem_alloc) + skb->truesize >= (unsigned)sk->rcvbuf) {
++		err = -ENOMEM;
++		goto out;
++	}
++
++	/* It would be deadlock, if sock_queue_rcv_skb is used
++	   with socket lock! We assume that users of this
++	   function are lock free.
++	*/
++	err = sk_filter(sk, skb, 1);
++	if (err)
++		goto out;
++
++	skb->dev = NULL;
++	skb_set_owner_r(skb, sk);
++
++	/* Cache the SKB length before we tack it onto the receive
++	 * queue. Once it is added it no longer belongs to us and
++	 * may be freed by other threads of control pulling packets
++	 * from the queue.
++	 */
++	skb_len = skb->len;
++
++	skb_queue_tail(&sk->receive_queue, skb);
++	if (!sk->dead)
++		sk->data_ready(sk,skb_len);
++out:
++	return err;
++}
++/* Queue skb on sk->error_queue; returns 0, or -ENOMEM once rmem_alloc + truesize reaches sk->rcvbuf. */
++static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
++{
++	int skb_len = skb->len;	/* cached up front: once queued, the skb may be freed by whoever dequeues it */
++
++	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces number of warnings when compiling with -W --ANK */
++	if (atomic_read(&sk->rmem_alloc) + skb->truesize >= (unsigned)sk->rcvbuf)
++		return -ENOMEM;
++	skb_set_owner_r(skb, sk);
++	skb_queue_tail(&sk->error_queue,skb);
++	if (!sk->dead)
++		sk->data_ready(sk,skb_len);
++	return 0;
++}
++
++/*
++ * Recover an error report and clear atomically
++ */
++
++static inline int sock_error(struct sock *sk)
++{
++	int err=xchg(&sk->err,0);
++	return -err;
++}
++
++static inline unsigned long sock_wspace(struct sock *sk)
++{
++	int amt = 0;
++
++	if (!(sk->shutdown & SEND_SHUTDOWN)) {
++		amt = sk->sndbuf - atomic_read(&sk->wmem_alloc);
++		if (amt < 0)
++			amt = 0;
++	}
++	return amt;
++}
++
++static inline void sk_wake_async(struct sock *sk, int how, int band)
++{
++	if (sk->socket && sk->socket->fasync_list)
++		sock_wake_async(sk->socket, how, band);
++}
++
++#define SOCK_MIN_SNDBUF 2048
++#define SOCK_MIN_RCVBUF 256
++
++/*
++ * Default write policy as shown to user space via poll/select/SIGIO
++ */
++static inline int sock_writeable(struct sock *sk)
++{
++	return atomic_read(&sk->wmem_alloc) < (sk->sndbuf / 2);
++}
++
++static inline int gfp_any(void)
++{
++	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
++}
++
++static inline long sock_rcvtimeo(struct sock *sk, int noblock)
++{
++	return noblock ? 0 : sk->rcvtimeo;
++}
++
++static inline long sock_sndtimeo(struct sock *sk, int noblock)
++{
++	return noblock ? 0 : sk->sndtimeo;
++}
++
++static inline int sock_rcvlowat(struct sock *sk, int waitall, int len)
++{
++	return (waitall ? len : min_t(int, sk->rcvlowat, len)) ? : 1;
++}
++
++/* Alas, with timeout socket operations are not restartable.
++ * Compare this to poll().
++ */
++static inline int sock_intr_errno(long timeo)
++{
++	return timeo == MAX_SCHEDULE_TIMEOUT ?
-ERESTARTSYS : -EINTR; ++} ++ ++static __inline__ void ++sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) ++{ ++ if (sk->rcvtstamp) ++ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(skb->stamp), &skb->stamp); ++ else ++ sk->stamp = skb->stamp; ++} ++ ++/* ++ * Enable debug/info messages ++ */ ++ ++#if 0 ++#define NETDEBUG(x) do { } while (0) ++#else ++#define NETDEBUG(x) do { x; } while (0) ++#endif ++ ++/* ++ * Macros for sleeping on a socket. Use them like this: ++ * ++ * SOCK_SLEEP_PRE(sk) ++ * if (condition) ++ * schedule(); ++ * SOCK_SLEEP_POST(sk) ++ * ++ */ ++ ++#define SOCK_SLEEP_PRE(sk) { struct task_struct *tsk = current; \ ++ DECLARE_WAITQUEUE(wait, tsk); \ ++ tsk->state = TASK_INTERRUPTIBLE; \ ++ add_wait_queue((sk)->sleep, &wait); \ ++ release_sock(sk); ++ ++#define SOCK_SLEEP_POST(sk) tsk->state = TASK_RUNNING; \ ++ remove_wait_queue((sk)->sleep, &wait); \ ++ lock_sock(sk); \ ++ } ++ ++extern __u32 sysctl_wmem_max; ++extern __u32 sysctl_rmem_max; ++ ++#endif /* _SOCK_H */ +diff --unified --recursive --new-file linux-2.4.30/net/Config.in linux-2.4.30-1-686-smp-ring3/net/Config.in +--- linux-2.4.30/net/Config.in 2005-01-19 15:10:13.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/Config.in 2005-10-22 23:08:28.028051250 +0200 +@@ -15,6 +15,9 @@ + bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG + fi + bool 'Socket Filtering' CONFIG_FILTER ++if [ "$CONFIG_EXPERIMENTAL" = "y" -a "$CONFIG_FILTER" = "y" ]; then ++ source net/ring/Config.in ++fi + tristate 'Unix domain sockets' CONFIG_UNIX + bool 'TCP/IP networking' CONFIG_INET + if [ "$CONFIG_INET" = "y" ]; then +diff --unified --recursive --new-file linux-2.4.30/net/Config.in.ORG linux-2.4.30-1-686-smp-ring3/net/Config.in.ORG +--- linux-2.4.30/net/Config.in.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/Config.in.ORG 2005-10-22 23:08:28.020050750 +0200 +@@ -0,0 +1,107 @@ ++# ++# Network configuration ++# ++mainmenu_option 
next_comment ++comment 'Networking options' ++tristate 'Packet socket' CONFIG_PACKET ++if [ "$CONFIG_PACKET" != "n" ]; then ++ bool ' Packet socket: mmapped IO' CONFIG_PACKET_MMAP ++fi ++ ++tristate 'Netlink device emulation' CONFIG_NETLINK_DEV ++ ++bool 'Network packet filtering (replaces ipchains)' CONFIG_NETFILTER ++if [ "$CONFIG_NETFILTER" = "y" ]; then ++ bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG ++fi ++bool 'Socket Filtering' CONFIG_FILTER ++tristate 'Unix domain sockets' CONFIG_UNIX ++bool 'TCP/IP networking' CONFIG_INET ++if [ "$CONFIG_INET" = "y" ]; then ++ source net/ipv4/Config.in ++ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++# IPv6 as module will cause a CRASH if you try to unload it ++ tristate ' The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6 ++ if [ "$CONFIG_IPV6" != "n" ]; then ++ source net/ipv6/Config.in ++ fi ++ fi ++ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++ source net/khttpd/Config.in ++ fi ++ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++ source net/sctp/Config.in ++ fi ++fi ++if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++ tristate 'Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)' CONFIG_ATM ++ if [ "$CONFIG_ATM" = "y" -o "$CONFIG_ATM" = "m" ]; then ++ if [ "$CONFIG_INET" = "y" ]; then ++ dep_tristate ' Classical IP over ATM' CONFIG_ATM_CLIP $CONFIG_ATM ++ if [ "$CONFIG_ATM_CLIP" != "n" ]; then ++ bool ' Do NOT send ICMP if no neighbour' CONFIG_ATM_CLIP_NO_ICMP ++ fi ++ fi ++ dep_tristate ' LAN Emulation (LANE) support' CONFIG_ATM_LANE $CONFIG_ATM ++ if [ "$CONFIG_INET" = "y" -a "$CONFIG_ATM_LANE" != "n" ]; then ++ tristate ' Multi-Protocol Over ATM (MPOA) support' CONFIG_ATM_MPOA ++ fi ++ dep_tristate ' RFC1483/2684 Bridged protocols' CONFIG_ATM_BR2684 $CONFIG_ATM ++ if [ "$CONFIG_ATM_BR2684" != "n" ]; then ++ bool ' Per-VC IP filter kludge' CONFIG_ATM_BR2684_IPFILTER ++ fi ++ fi ++fi ++tristate '802.1Q VLAN Support' CONFIG_VLAN_8021Q ++ ++comment ' ' ++tristate 'The IPX protocol' CONFIG_IPX ++if [ "$CONFIG_IPX" != "n" 
]; then ++ source net/ipx/Config.in ++fi ++ ++tristate 'Appletalk protocol support' CONFIG_ATALK ++if [ "$CONFIG_ATALK" != "n" ]; then ++ source drivers/net/appletalk/Config.in ++fi ++ ++tristate 'DECnet Support' CONFIG_DECNET ++if [ "$CONFIG_DECNET" != "n" ]; then ++ source net/decnet/Config.in ++fi ++dep_tristate '802.1d Ethernet Bridging' CONFIG_BRIDGE $CONFIG_INET ++if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++ tristate 'CCITT X.25 Packet Layer (EXPERIMENTAL)' CONFIG_X25 ++ tristate 'LAPB Data Link Driver (EXPERIMENTAL)' CONFIG_LAPB ++ bool '802.2 LLC (EXPERIMENTAL)' CONFIG_LLC ++ bool 'Frame Diverter (EXPERIMENTAL)' CONFIG_NET_DIVERT ++# if [ "$CONFIG_LLC" = "y" ]; then ++# bool ' Netbeui (EXPERIMENTAL)' CONFIG_NETBEUI ++# fi ++ if [ "$CONFIG_INET" = "y" ]; then ++ tristate 'Acorn Econet/AUN protocols (EXPERIMENTAL)' CONFIG_ECONET ++ if [ "$CONFIG_ECONET" != "n" ]; then ++ bool ' AUN over UDP' CONFIG_ECONET_AUNUDP ++ bool ' Native Econet' CONFIG_ECONET_NATIVE ++ fi ++ fi ++ tristate 'WAN router' CONFIG_WAN_ROUTER ++ bool 'Fast switching (read help!)' CONFIG_NET_FASTROUTE ++ bool 'Forwarding between high speed interfaces' CONFIG_NET_HW_FLOWCONTROL ++fi ++ ++mainmenu_option next_comment ++comment 'QoS and/or fair queueing' ++bool 'QoS and/or fair queueing' CONFIG_NET_SCHED ++if [ "$CONFIG_NET_SCHED" = "y" ]; then ++ source net/sched/Config.in ++fi ++#bool 'Network code profiler' CONFIG_NET_PROFILE ++endmenu ++ ++mainmenu_option next_comment ++comment 'Network testing' ++dep_tristate 'Packet Generator (USE WITH CAUTION)' CONFIG_NET_PKTGEN $CONFIG_PROC_FS ++endmenu ++ ++endmenu +diff --unified --recursive --new-file linux-2.4.30/net/Makefile linux-2.4.30-1-686-smp-ring3/net/Makefile +--- linux-2.4.30/net/Makefile 2004-08-08 01:26:06.000000000 +0200 ++++ linux-2.4.30-1-686-smp-ring3/net/Makefile 2005-10-22 23:08:27.928045000 +0200 +@@ -7,7 +7,7 @@ + + O_TARGET := network.o + +-mod-subdirs := ipv4/netfilter ipv6/netfilter ipx irda bluetooth atm netlink sched core 
sctp 802 ++mod-subdirs := ipv4/netfilter ipv6/netfilter ipx irda bluetooth atm netlink sched core sctp 802 ring + export-objs := netsyms.o + + subdir-y := core ethernet +@@ -46,6 +46,7 @@ + subdir-$(CONFIG_DECNET) += decnet + subdir-$(CONFIG_ECONET) += econet + subdir-$(CONFIG_VLAN_8021Q) += 8021q ++subdir-$(CONFIG_RING) += ring + + ifeq ($(CONFIG_NETFILTER),y) + mod-subdirs += ipv4/ipvs +diff --unified --recursive --new-file linux-2.4.30/net/Makefile.ORG linux-2.4.30-1-686-smp-ring3/net/Makefile.ORG +--- linux-2.4.30/net/Makefile.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/Makefile.ORG 2005-10-22 23:08:27.916044250 +0200 +@@ -0,0 +1,61 @@ ++# ++# Makefile for the linux networking. ++# ++# 2 Sep 2000, Christoph Hellwig ++# Rewritten to use lists instead of if-statements. ++# ++ ++O_TARGET := network.o ++ ++mod-subdirs := ipv4/netfilter ipv6/netfilter ipx irda bluetooth atm netlink sched core sctp 802 ++export-objs := netsyms.o ++ ++subdir-y := core ethernet ++subdir-m := ipv4 # hum? 
++
++
++subdir-$(CONFIG_NET) += 802 sched netlink
++subdir-$(CONFIG_IPV6) += ipv6
++subdir-$(CONFIG_INET) += ipv4
++subdir-$(CONFIG_NETFILTER) += ipv4/netfilter
++subdir-$(CONFIG_UNIX) += unix
++subdir-$(CONFIG_IP_SCTP) += sctp
++
++ifneq ($(CONFIG_IPV6),n)
++ifneq ($(CONFIG_IPV6),)
++subdir-$(CONFIG_NETFILTER) += ipv6/netfilter
++endif
++endif
++
++subdir-$(CONFIG_KHTTPD) += khttpd
++subdir-$(CONFIG_PACKET) += packet
++subdir-$(CONFIG_NET_SCHED) += sched
++subdir-$(CONFIG_BRIDGE) += bridge
++subdir-$(CONFIG_IPX) += ipx
++subdir-$(CONFIG_ATALK) += appletalk
++subdir-$(CONFIG_WAN_ROUTER) += wanrouter
++subdir-$(CONFIG_X25) += x25
++subdir-$(CONFIG_LAPB) += lapb
++subdir-$(CONFIG_NETROM) += netrom
++subdir-$(CONFIG_ROSE) += rose
++subdir-$(CONFIG_AX25) += ax25
++subdir-$(CONFIG_IRDA) += irda
++subdir-$(CONFIG_BLUEZ) += bluetooth
++subdir-$(CONFIG_SUNRPC) += sunrpc
++subdir-$(CONFIG_ATM) += atm
++subdir-$(CONFIG_DECNET) += decnet
++subdir-$(CONFIG_ECONET) += econet
++subdir-$(CONFIG_VLAN_8021Q) += 8021q
++
++ifeq ($(CONFIG_NETFILTER),y)
++ mod-subdirs += ipv4/ipvs
++ subdir-$(CONFIG_IP_VS) += ipv4/ipvs
++endif
++
++obj-y := socket.o $(join $(subdir-y), $(patsubst %,/%.o,$(notdir $(subdir-y))))
++ifeq ($(CONFIG_NET),y)
++obj-$(CONFIG_MODULES) += netsyms.o
++obj-$(CONFIG_SYSCTL) += sysctl_net.o
++endif
++
++include $(TOPDIR)/Rules.make
+diff --unified --recursive --new-file linux-2.4.30/net/core/dev.c linux-2.4.30-1-686-smp-ring3/net/core/dev.c
+--- linux-2.4.30/net/core/dev.c 2005-04-04 03:42:20.000000000 +0200
++++ linux-2.4.30-1-686-smp-ring3/net/core/dev.c 2005-10-22 23:08:27.900043250 +0200
+@@ -104,6 +104,56 @@
+ #include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
+ #include <net/iw_handler.h>
+ #endif	/* CONFIG_NET_RADIO || CONFIG_NET_PCMCIA_RADIO */
++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
++
++/* #define RING_DEBUG */
++
++#include <linux/ring.h>
++#include <linux/version.h>
++
++static handle_ring_skb ring_handler = NULL; /* skb hook; NULL while no handler is installed */
++/* Return the installed skb hook, or NULL when none is set. */
++handle_ring_skb get_skb_ring_handler(void) { return(ring_handler); }
++/* Install the skb hook; passing NULL removes it. */
++void set_skb_ring_handler(handle_ring_skb the_handler) {
++  ring_handler = the_handler;
++}
++/* Hand skb to the skb hook if one is installed; the hook's return value is ignored. */
++void do_skb_ring_handler(struct sk_buff *skb,
++			 u_char recv_packet, u_char real_skb) {
++  handle_ring_skb handler = *(volatile handle_ring_skb *)&ring_handler; /* single read: set_skb_ring_handler(NULL) must not land between the test and the call */
++  if(handler) handler(skb, recv_packet, real_skb);
++}
++
++/* ******************* */
++
++static handle_ring_buffer buffer_ring_handler = NULL; /* raw-buffer hook; NULL while no handler is installed */
++/* Return the installed raw-buffer hook, or NULL when none is set. */
++handle_ring_buffer get_buffer_ring_handler(void) { return(buffer_ring_handler); }
++/* Install the raw-buffer hook; passing NULL removes it. */
++void set_buffer_ring_handler(handle_ring_buffer the_handler) {
++  buffer_ring_handler = the_handler;
++}
++/* Run the raw-buffer hook on (dev, data, len); returns 1 if a hook was called, 0 if none is set. */
++int do_buffer_ring_handler(struct net_device *dev, char *data, int len) {
++  handle_ring_buffer handler = *(volatile handle_ring_buffer *)&buffer_ring_handler; /* single read: keeps the NULL test and the call on the same pointer value */
++  if(handler == NULL)
++    return(0);
++  handler(dev, data, len);
++  return(1);
++}
++
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++EXPORT_SYMBOL(get_skb_ring_handler);
++EXPORT_SYMBOL(set_skb_ring_handler);
++EXPORT_SYMBOL(do_skb_ring_handler);
++
++EXPORT_SYMBOL(get_buffer_ring_handler);
++EXPORT_SYMBOL(set_buffer_ring_handler);
++EXPORT_SYMBOL(do_buffer_ring_handler);
++#endif
++
++#endif
+ #ifdef CONFIG_PLIP
+ extern int plip_init(void);
+ #endif
+@@ -1066,6 +1116,10 @@
+ 		return -ENOMEM;
+ 	}
+ 
++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
++	do_skb_ring_handler(skb, 0, 1); /* transmit path: recv_packet=0, real_skb=1 */
++#endif /* CONFIG_RING */
++
+ 	/* Grab device queue */
+ 	spin_lock_bh(&dev->queue_lock);
+ 	q = dev->qdisc;
+@@ -1278,6 +1332,13 @@
+ 	struct softnet_data *queue;
+ 	unsigned long flags;
+ 
++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
++	if(ring_handler && ring_handler(skb, 1, 1)) {
++		/* The packet has been copied into a ring */
++		return(NET_RX_SUCCESS);
++	}
++#endif /* CONFIG_RING */
++
+ 	if (skb->stamp.tv_sec == 0)
+ 		do_gettimeofday(&skb->stamp);
+ 
+@@ -1464,6 +1525,13 @@
+ 	int ret = NET_RX_DROP;
+ 	unsigned short type;
+ 
++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
++	if(ring_handler && ring_handler(skb, 1, 1)) {
++		/* The packet has been copied into a ring */
++		return(NET_RX_SUCCESS);
++	}
++#endif /* CONFIG_RING */ ++ + if (skb->stamp.tv_sec == 0) + do_gettimeofday(&skb->stamp); + +diff --unified --recursive --new-file linux-2.4.30/net/core/dev.c.ORG linux-2.4.30-1-686-smp-ring3/net/core/dev.c.ORG +--- linux-2.4.30/net/core/dev.c.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/core/dev.c.ORG 2005-10-22 23:08:27.472016500 +0200 +@@ -0,0 +1,2926 @@ ++/* ++ * NET3 Protocol independent device support routines. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * Derived from the non IP parts of dev.c 1.0.19 ++ * Authors: Ross Biro, ++ * Fred N. van Kempen, ++ * Mark Evans, ++ * ++ * Additional Authors: ++ * Florian la Roche ++ * Alan Cox ++ * David Hinds ++ * Alexey Kuznetsov ++ * Adam Sulmicki ++ * Pekka Riikonen ++ * ++ * Changes: ++ * D.J. Barrow : Fixed bug where dev->refcnt gets set to 2 ++ * if register_netdev gets called before ++ * net_dev_init & also removed a few lines ++ * of code in the process. ++ * Alan Cox : device private ioctl copies fields back. ++ * Alan Cox : Transmit queue code does relevant stunts to ++ * keep the queue safe. ++ * Alan Cox : Fixed double lock. ++ * Alan Cox : Fixed promisc NULL pointer trap ++ * ???????? : Support the full private ioctl range ++ * Alan Cox : Moved ioctl permission check into drivers ++ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI ++ * Alan Cox : 100 backlog just doesn't cut it when ++ * you start doing multicast video 8) ++ * Alan Cox : Rewrote net_bh and list manager. ++ * Alan Cox : Fix ETH_P_ALL echoback lengths. ++ * Alan Cox : Took out transmit every packet pass ++ * Saved a few bytes in the ioctl handler ++ * Alan Cox : Network driver sets packet type before calling netif_rx. Saves ++ * a function call a packet. 
++ * Alan Cox : Hashed net_bh() ++ * Richard Kooijman: Timestamp fixes. ++ * Alan Cox : Wrong field in SIOCGIFDSTADDR ++ * Alan Cox : Device lock protection. ++ * Alan Cox : Fixed nasty side effect of device close changes. ++ * Rudi Cilibrasi : Pass the right thing to set_mac_address() ++ * Dave Miller : 32bit quantity for the device lock to make it work out ++ * on a Sparc. ++ * Bjorn Ekwall : Added KERNELD hack. ++ * Alan Cox : Cleaned up the backlog initialise. ++ * Craig Metz : SIOCGIFCONF fix if space for under ++ * 1 device. ++ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there ++ * is no device open function. ++ * Andi Kleen : Fix error reporting for SIOCGIFCONF ++ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF ++ * Cyrus Durgin : Cleaned for KMOD ++ * Adam Sulmicki : Bug Fix : Network Device Unload ++ * A network device unload needs to purge ++ * the backlog queue. ++ * Paul Rusty Russell : SIOCSIFNAME ++ * Pekka Riikonen : Netdev boot-time settings code ++ * Andrew Morton : Make unregister_netdevice wait indefinitely on dev->refcnt ++ * J Hadi Salim : - Backlog queue sampling ++ * - netif_rx() feedback ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if defined(CONFIG_NET_RADIO) || defined(CONFIG_NET_PCMCIA_RADIO) ++#include /* Note : will define WIRELESS_EXT */ ++#include ++#endif /* CONFIG_NET_RADIO || CONFIG_NET_PCMCIA_RADIO */ ++#ifdef CONFIG_PLIP ++extern int plip_init(void); ++#endif ++ ++ ++/* This define, if set, will randomly drop a packet when congestion ++ * is more than moderate. 
It helps fairness in the multi-interface ++ * case when one of them is a hog, but it kills performance for the ++ * single interface case so it is off now by default. ++ */ ++#undef RAND_LIE ++ ++/* Setting this will sample the queue lengths and thus congestion ++ * via a timer instead of as each packet is received. ++ */ ++#undef OFFLINE_SAMPLE ++ ++NET_PROFILE_DEFINE(dev_queue_xmit) ++NET_PROFILE_DEFINE(softnet_process) ++ ++const char *if_port_text[] = { ++ "unknown", ++ "BNC", ++ "10baseT", ++ "AUI", ++ "100baseT", ++ "100baseTX", ++ "100baseFX" ++}; ++ ++/* ++ * The list of packet types we will receive (as opposed to discard) ++ * and the routines to invoke. ++ * ++ * Why 16. Because with 16 the only overlap we get on a hash of the ++ * low nibble of the protocol value is RARP/SNAP/X.25. ++ * ++ * NOTE: That is no longer true with the addition of VLAN tags. Not ++ * sure which should go first, but I bet it won't make much ++ * difference if we are running VLANs. The good news is that ++ * this protocol won't be in the list unless compiled in, so ++ * the average user (w/out VLANs) will not be adversly affected. ++ * --BLG ++ * ++ * 0800 IP ++ * 8100 802.1Q VLAN ++ * 0001 802.3 ++ * 0002 AX.25 ++ * 0004 802.2 ++ * 8035 RARP ++ * 0005 SNAP ++ * 0805 X.25 ++ * 0806 ARP ++ * 8137 IPX ++ * 0009 Localtalk ++ * 86DD IPv6 ++ */ ++ ++static struct packet_type *ptype_base[16]; /* 16 way hashed list */ ++static struct packet_type *ptype_all = NULL; /* Taps */ ++ ++#ifdef OFFLINE_SAMPLE ++static void sample_queue(unsigned long dummy); ++static struct timer_list samp_timer = { function: sample_queue }; ++#endif ++ ++#ifdef CONFIG_HOTPLUG ++static int net_run_sbin_hotplug(struct net_device *dev, char *action); ++#else ++#define net_run_sbin_hotplug(dev, action) ({ 0; }) ++#endif ++ ++/* ++ * Our notifier list ++ */ ++ ++static struct notifier_block *netdev_chain=NULL; ++ ++/* ++ * Device drivers call our routines to queue packets here. 
We empty the ++ * queue in the local softnet handler. ++ */ ++struct softnet_data softnet_data[NR_CPUS] __cacheline_aligned; ++ ++#ifdef CONFIG_NET_FASTROUTE ++int netdev_fastroute; ++int netdev_fastroute_obstacles; ++#endif ++ ++ ++/****************************************************************************************** ++ ++ Protocol management and registration routines ++ ++*******************************************************************************************/ ++ ++/* ++ * For efficiency ++ */ ++ ++int netdev_nit=0; ++ ++/* ++ * Add a protocol ID to the list. Now that the input handler is ++ * smarter we can dispense with all the messy stuff that used to be ++ * here. ++ * ++ * BEWARE!!! Protocol handlers, mangling input packets, ++ * MUST BE last in hash buckets and checking protocol handlers ++ * MUST start from promiscous ptype_all chain in net_bh. ++ * It is true now, do not change it. ++ * Explantion follows: if protocol handler, mangling packet, will ++ * be the first on list, it is not able to sense, that packet ++ * is cloned and should be copied-on-write, so that it will ++ * change it and subsequent readers will get broken packet. ++ * --ANK (980803) ++ */ ++ ++/** ++ * dev_add_pack - add packet handler ++ * @pt: packet type declaration ++ * ++ * Add a protocol handler to the networking stack. The passed &packet_type ++ * is linked into kernel lists and may not be freed until it has been ++ * removed from the kernel lists. 
++ */ ++ ++void dev_add_pack(struct packet_type *pt) ++{ ++ int hash; ++ ++ br_write_lock_bh(BR_NETPROTO_LOCK); ++ ++#ifdef CONFIG_NET_FASTROUTE ++ /* Hack to detect packet socket */ ++ if ((pt->data) && ((int)(pt->data)!=1)) { ++ netdev_fastroute_obstacles++; ++ dev_clear_fastroute(pt->dev); ++ } ++#endif ++ if (pt->type == htons(ETH_P_ALL)) { ++ netdev_nit++; ++ pt->next=ptype_all; ++ ptype_all=pt; ++ } else { ++ hash=ntohs(pt->type)&15; ++ pt->next = ptype_base[hash]; ++ ptype_base[hash] = pt; ++ } ++ br_write_unlock_bh(BR_NETPROTO_LOCK); ++} ++ ++ ++/** ++ * dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ */ ++ ++void dev_remove_pack(struct packet_type *pt) ++{ ++ struct packet_type **pt1; ++ ++ br_write_lock_bh(BR_NETPROTO_LOCK); ++ ++ if (pt->type == htons(ETH_P_ALL)) { ++ netdev_nit--; ++ pt1=&ptype_all; ++ } else { ++ pt1=&ptype_base[ntohs(pt->type)&15]; ++ } ++ ++ for (; (*pt1) != NULL; pt1 = &((*pt1)->next)) { ++ if (pt == (*pt1)) { ++ *pt1 = pt->next; ++#ifdef CONFIG_NET_FASTROUTE ++ if (pt->data) ++ netdev_fastroute_obstacles--; ++#endif ++ br_write_unlock_bh(BR_NETPROTO_LOCK); ++ return; ++ } ++ } ++ br_write_unlock_bh(BR_NETPROTO_LOCK); ++ printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); ++} ++ ++/****************************************************************************** ++ ++ Device Boot-time Settings Routines ++ ++*******************************************************************************/ ++ ++/* Boot time configuration table */ ++static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; ++ ++/** ++ * netdev_boot_setup_add - add new setup entry ++ * @name: name of the device ++ * @map: configured settings for the device ++ * ++ * Adds new setup entry 
to the dev_boot_setup list. The function ++ * returns 0 on error and 1 on success. This is a generic routine to ++ * all netdevices. ++ */ ++int netdev_boot_setup_add(char *name, struct ifmap *map) ++{ ++ struct netdev_boot_setup *s; ++ int i; ++ ++ s = dev_boot_setup; ++ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { ++ if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { ++ memset(s[i].name, 0, sizeof(s[i].name)); ++ strcpy(s[i].name, name); ++ memcpy(&s[i].map, map, sizeof(s[i].map)); ++ break; ++ } ++ } ++ ++ if (i >= NETDEV_BOOT_SETUP_MAX) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * netdev_boot_setup_check - check boot time settings ++ * @dev: the netdevice ++ * ++ * Check boot time settings for the device. ++ * The found settings are set for the device to be used ++ * later in the device probing. ++ * Returns 0 if no settings found, 1 if they are. ++ */ ++int netdev_boot_setup_check(struct net_device *dev) ++{ ++ struct netdev_boot_setup *s; ++ int i; ++ ++ s = dev_boot_setup; ++ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { ++ if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && ++ !strncmp(dev->name, s[i].name, strlen(s[i].name))) { ++ dev->irq = s[i].map.irq; ++ dev->base_addr = s[i].map.base_addr; ++ dev->mem_start = s[i].map.mem_start; ++ dev->mem_end = s[i].map.mem_end; ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++/* ++ * Saves at boot time configured settings for any netdevice. 
++ */ ++int __init netdev_boot_setup(char *str) ++{ ++ int ints[5]; ++ struct ifmap map; ++ ++ str = get_options(str, ARRAY_SIZE(ints), ints); ++ if (!str || !*str) ++ return 0; ++ ++ /* Save settings */ ++ memset(&map, 0, sizeof(map)); ++ if (ints[0] > 0) ++ map.irq = ints[1]; ++ if (ints[0] > 1) ++ map.base_addr = ints[2]; ++ if (ints[0] > 2) ++ map.mem_start = ints[3]; ++ if (ints[0] > 3) ++ map.mem_end = ints[4]; ++ ++ /* Add new entry to the list */ ++ return netdev_boot_setup_add(str, &map); ++} ++ ++__setup("netdev=", netdev_boot_setup); ++ ++/***************************************************************************************** ++ ++ Device Interface Subroutines ++ ++******************************************************************************************/ ++ ++/** ++ * __dev_get_by_name - find a device by its name ++ * @name: name to find ++ * ++ * Find an interface by name. Must be called under RTNL semaphore ++ * or @dev_base_lock. If the name is found a pointer to the device ++ * is returned. If the name is not found then %NULL is returned. The ++ * reference counters are not incremented so the caller must be ++ * careful with locks. ++ */ ++ ++ ++struct net_device *__dev_get_by_name(const char *name) ++{ ++ struct net_device *dev; ++ ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ if (strncmp(dev->name, name, IFNAMSIZ) == 0) ++ return dev; ++ } ++ return NULL; ++} ++ ++/** ++ * dev_get_by_name - find a device by its name ++ * @name: name to find ++ * ++ * Find an interface by name. This can be called from any ++ * context and does its own locking. The returned handle has ++ * the usage count incremented and the caller must use dev_put() to ++ * release it when it is no longer needed. %NULL is returned if no ++ * matching device is found. 
++ */ ++ ++struct net_device *dev_get_by_name(const char *name) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_name(name); ++ if (dev) ++ dev_hold(dev); ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/* ++ Return value is changed to int to prevent illegal usage in future. ++ It is still legal to use to check for device existence. ++ ++ User should understand, that the result returned by this function ++ is meaningless, if it was not issued under rtnl semaphore. ++ */ ++ ++/** ++ * dev_get - test if a device exists ++ * @name: name to test for ++ * ++ * Test if a name exists. Returns true if the name is found. In order ++ * to be sure the name is not allocated or removed during the test the ++ * caller must hold the rtnl semaphore. ++ * ++ * This function primarily exists for back compatibility with older ++ * drivers. ++ */ ++ ++int dev_get(const char *name) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_name(name); ++ read_unlock(&dev_base_lock); ++ return dev != NULL; ++} ++ ++/** ++ * __dev_get_by_index - find a device by its ifindex ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold either the RTNL semaphore ++ * or @dev_base_lock. ++ */ ++ ++struct net_device * __dev_get_by_index(int ifindex) ++{ ++ struct net_device *dev; ++ ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ if (dev->ifindex == ifindex) ++ return dev; ++ } ++ return NULL; ++} ++ ++ ++/** ++ * dev_get_by_index - find a device by its ifindex ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns NULL if the device ++ * is not found or a pointer to the device. 
The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. ++ */ ++ ++struct net_device * dev_get_by_index(int ifindex) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_index(ifindex); ++ if (dev) ++ dev_hold(dev); ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/** ++ * dev_getbyhwaddr - find a device by its hardware address ++ * @type: media type of device ++ * @ha: hardware address ++ * ++ * Search for an interface by MAC address. Returns NULL if the device ++ * is not found or a pointer to the device. The caller must hold the ++ * rtnl semaphore. The returned device has not had its ref count increased ++ * and the caller must therefore be careful about locking ++ * ++ * BUGS: ++ * If the API was consistent this would be __dev_get_by_hwaddr ++ */ ++ ++struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) ++{ ++ struct net_device *dev; ++ ++ ASSERT_RTNL(); ++ ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ if (dev->type == type && ++ memcmp(dev->dev_addr, ha, dev->addr_len) == 0) ++ return dev; ++ } ++ return NULL; ++} ++ ++/** ++ * dev_get_by_flags - find any device with given flags ++ * @if_flags: IFF_* values ++ * @mask: bitmask of bits in if_flags to check ++ * ++ * Search for any interface with the given flags. Returns NULL if a device ++ * is not found or a pointer to the device. The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. 
++ */ ++ ++struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_flags(if_flags, mask); ++ if (dev) ++ dev_hold(dev); ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/** ++ * __dev_get_by_flags - find any device with given flags ++ * @if_flags: IFF_* values ++ * @mask: bitmask of bits in if_flags to check ++ * ++ * Search for any interface with the given flags. Returns NULL if a device ++ * is not found or a pointer to the device. The caller must hold either ++ * the RTNL semaphore or @dev_base_lock. ++ */ ++ ++struct net_device *__dev_get_by_flags(unsigned short if_flags, unsigned short mask) ++{ ++ struct net_device *dev; ++ ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ if (((dev->flags ^ if_flags) & mask) == 0) ++ return dev; ++ } ++ return NULL; ++} ++ ++/** ++ * dev_alloc_name - allocate a name for a device ++ * @dev: device ++ * @name: name format string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. Not efficient for many devices, not called a lot. The caller ++ * must hold the dev_base or rtnl lock while allocating the name and ++ * adding the device in order to avoid duplicates. Returns the number ++ * of the unit assigned or a negative errno code. ++ */ ++ ++int dev_alloc_name(struct net_device *dev, const char *name) ++{ ++ int i; ++ char buf[32]; ++ char *p; ++ ++ /* ++ * Verify the string as this thing may have come from ++ * the user. There must be either one "%d" and no other "%" ++ * characters, or no "%" characters at all. ++ */ ++ p = strchr(name, '%'); ++ if (p && (p[1] != 'd' || strchr(p+2, '%'))) ++ return -EINVAL; ++ ++ /* ++ * If you need over 100 please also fix the algorithm... 
++ */ ++ for (i = 0; i < 100; i++) { ++ snprintf(buf,sizeof(buf),name,i); ++ if (__dev_get_by_name(buf) == NULL) { ++ strcpy(dev->name, buf); ++ return i; ++ } ++ } ++ return -ENFILE; /* Over 100 of the things .. bail out! */ ++} ++ ++/** ++ * dev_alloc - allocate a network device and name ++ * @name: name format string ++ * @err: error return pointer ++ * ++ * Passed a format string, eg. "lt%d", it will allocate a network device ++ * and space for the name. %NULL is returned if no memory is available. ++ * If the allocation succeeds then the name is assigned and the ++ * device pointer returned. %NULL is returned if the name allocation ++ * failed. The cause of an error is returned as a negative errno code ++ * in the variable @err points to. ++ * ++ * The caller must hold the @dev_base or RTNL locks when doing this in ++ * order to avoid duplicate name allocations. ++ */ ++ ++struct net_device *dev_alloc(const char *name, int *err) ++{ ++ struct net_device *dev=kmalloc(sizeof(struct net_device), GFP_KERNEL); ++ if (dev == NULL) { ++ *err = -ENOBUFS; ++ return NULL; ++ } ++ memset(dev, 0, sizeof(struct net_device)); ++ *err = dev_alloc_name(dev, name); ++ if (*err < 0) { ++ kfree(dev); ++ return NULL; ++ } ++ return dev; ++} ++ ++/** ++ * netdev_state_change - device changes state ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed state. This function calls ++ * the notifier chains for netdev_chain and sends a NEWLINK message ++ * to the routing socket. ++ */ ++ ++void netdev_state_change(struct net_device *dev) ++{ ++ if (dev->flags&IFF_UP) { ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); ++ rtmsg_ifinfo(RTM_NEWLINK, dev, 0); ++ } ++} ++ ++ ++#ifdef CONFIG_KMOD ++ ++/** ++ * dev_load - load a network module ++ * @name: name of interface ++ * ++ * If a network interface is not present and the process has suitable ++ * privileges this function loads the module. 
If module loading is not ++ * available in this kernel then it becomes a nop. ++ */ ++ ++void dev_load(const char *name) ++{ ++ if (!dev_get(name) && capable(CAP_SYS_MODULE)) ++ request_module(name); ++} ++ ++#else ++ ++extern inline void dev_load(const char *unused){;} ++ ++#endif ++ ++static int default_rebuild_header(struct sk_buff *skb) ++{ ++ printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", skb->dev ? skb->dev->name : "NULL!!!"); ++ kfree_skb(skb); ++ return 1; ++} ++ ++/** ++ * dev_open - prepare an interface for use. ++ * @dev: device to open ++ * ++ * Takes a device from down to up state. The device's private open ++ * function is invoked and then the multicast lists are loaded. Finally ++ * the device is moved into the up state and a %NETDEV_UP message is ++ * sent to the netdev notifier chain. ++ * ++ * Calling this function on an active interface is a nop. On a failure ++ * a negative errno code is returned. ++ */ ++ ++int dev_open(struct net_device *dev) ++{ ++ int ret = 0; ++ ++ /* ++ * Is it already up? ++ */ ++ ++ if (dev->flags&IFF_UP) ++ return 0; ++ ++ /* ++ * Is it even present? ++ */ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ /* ++ * Call device private open method ++ */ ++ if (try_inc_mod_count(dev->owner)) { ++ set_bit(__LINK_STATE_START, &dev->state); ++ if (dev->open) { ++ ret = dev->open(dev); ++ if (ret != 0) { ++ clear_bit(__LINK_STATE_START, &dev->state); ++ if (dev->owner) ++ __MOD_DEC_USE_COUNT(dev->owner); ++ } ++ } ++ } else { ++ ret = -ENODEV; ++ } ++ ++ /* ++ * If it went open OK then: ++ */ ++ ++ if (ret == 0) ++ { ++ /* ++ * Set the flags. ++ */ ++ dev->flags |= IFF_UP; ++ ++ /* ++ * Initialize multicasting status ++ */ ++ dev_mc_upload(dev); ++ ++ /* ++ * Wakeup transmit queue engine ++ */ ++ dev_activate(dev); ++ ++ /* ++ * ... and announce new interface. 
++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_UP, dev); ++ } ++ return(ret); ++} ++ ++#ifdef CONFIG_NET_FASTROUTE ++ ++static void dev_do_clear_fastroute(struct net_device *dev) ++{ ++ if (dev->accept_fastpath) { ++ int i; ++ ++ for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) { ++ struct dst_entry *dst; ++ ++ write_lock_irq(&dev->fastpath_lock); ++ dst = dev->fastpath[i]; ++ dev->fastpath[i] = NULL; ++ write_unlock_irq(&dev->fastpath_lock); ++ ++ dst_release(dst); ++ } ++ } ++} ++ ++void dev_clear_fastroute(struct net_device *dev) ++{ ++ if (dev) { ++ dev_do_clear_fastroute(dev); ++ } else { ++ read_lock(&dev_base_lock); ++ for (dev = dev_base; dev; dev = dev->next) ++ dev_do_clear_fastroute(dev); ++ read_unlock(&dev_base_lock); ++ } ++} ++#endif ++ ++/** ++ * dev_close - shutdown an interface. ++ * @dev: device to shutdown ++ * ++ * This function moves an active device into down state. A ++ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device ++ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier ++ * chain. ++ */ ++ ++int dev_close(struct net_device *dev) ++{ ++ if (!(dev->flags&IFF_UP)) ++ return 0; ++ ++ /* ++ * Tell people we are going down, so that they can ++ * prepare to death, when device is still operating. ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); ++ ++ dev_deactivate(dev); ++ ++ clear_bit(__LINK_STATE_START, &dev->state); ++ ++ /* Synchronize to scheduled poll. We cannot touch poll list, ++ * it can be even on different cpu. So just clear netif_running(), ++ * and wait when poll really will happen. Actually, the best place ++ * for this is inside dev->stop() after device stopped its irq ++ * engine, but this requires more changes in devices. */ ++ ++ smp_mb__after_clear_bit(); /* Commit netif_running(). */ ++ while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { ++ /* No hurry. 
*/ ++ current->state = TASK_INTERRUPTIBLE; ++ schedule_timeout(1); ++ } ++ ++ /* ++ * Call the device specific close. This cannot fail. ++ * Only if device is UP ++ * ++ * We allow it to be called even after a DETACH hot-plug ++ * event. ++ */ ++ ++ if (dev->stop) ++ dev->stop(dev); ++ ++ /* ++ * Device is now down. ++ */ ++ ++ dev->flags &= ~IFF_UP; ++#ifdef CONFIG_NET_FASTROUTE ++ dev_clear_fastroute(dev); ++#endif ++ ++ /* ++ * Tell people we are down ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); ++ ++ /* ++ * Drop the module refcount ++ */ ++ if (dev->owner) ++ __MOD_DEC_USE_COUNT(dev->owner); ++ ++ return(0); ++} ++ ++ ++/* ++ * Device change register/unregister. These are not inline or static ++ * as we export them to the world. ++ */ ++ ++/** ++ * register_netdevice_notifier - register a network notifier block ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ */ ++ ++int register_netdevice_notifier(struct notifier_block *nb) ++{ ++ return notifier_chain_register(&netdev_chain, nb); ++} ++ ++/** ++ * unregister_netdevice_notifier - unregister a network notifier block ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ */ ++ ++int unregister_netdevice_notifier(struct notifier_block *nb) ++{ ++ return notifier_chain_unregister(&netdev_chain,nb); ++} ++ ++/* ++ * Support routine. Sends outgoing frames to any network ++ * taps currently in use. 
++ */ ++ ++void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct packet_type *ptype; ++ do_gettimeofday(&skb->stamp); ++ ++ br_read_lock(BR_NETPROTO_LOCK); ++ for (ptype = ptype_all; ptype!=NULL; ptype = ptype->next) ++ { ++ /* Never send packets back to the socket ++ * they originated from - MvS (miquels@drinkel.ow.org) ++ */ ++ if ((ptype->dev == dev || !ptype->dev) && ++ ((struct sock *)ptype->data != skb->sk)) ++ { ++ struct sk_buff *skb2; ++ if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) ++ break; ++ ++ /* skb->nh should be correctly ++ set by sender, so that the second statement is ++ just protection against buggy protocols. ++ */ ++ skb2->mac.raw = skb2->data; ++ ++ if (skb2->nh.raw < skb2->data || skb2->nh.raw > skb2->tail) { ++ if (net_ratelimit()) ++ printk(KERN_CRIT "protocol %04x is buggy, dev %s\n", skb2->protocol, dev->name); ++ skb2->nh.raw = skb2->data; ++ } ++ ++ skb2->h.raw = skb2->nh.raw; ++ skb2->pkt_type = PACKET_OUTGOING; ++ ptype->func(skb2, skb->dev, ptype); ++ } ++ } ++ br_read_unlock(BR_NETPROTO_LOCK); ++} ++ ++/* Calculate csum in the case, when packet is misrouted. ++ * If it failed by some reason, ignore and send skb with wrong ++ * checksum. ++ */ ++struct sk_buff * skb_checksum_help(struct sk_buff *skb) ++{ ++ int offset; ++ unsigned int csum; ++ ++ offset = skb->h.raw - skb->data; ++ if (offset > (int)skb->len) ++ BUG(); ++ csum = skb_checksum(skb, offset, skb->len-offset, 0); ++ ++ offset = skb->tail - skb->h.raw; ++ if (offset <= 0) ++ BUG(); ++ if (skb->csum+2 > offset) ++ BUG(); ++ ++ *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum); ++ skb->ip_summed = CHECKSUM_NONE; ++ return skb; ++} ++ ++#ifdef CONFIG_HIGHMEM ++/* Actually, we should eliminate this check as soon as we know, that: ++ * 1. IOMMU is present and allows to map all the memory. ++ * 2. No high memory really exists on this machine. 
++ */ ++ ++static inline int ++illegal_highdma(struct net_device *dev, struct sk_buff *skb) ++{ ++ int i; ++ ++ if (dev->features&NETIF_F_HIGHDMA) ++ return 0; ++ ++ for (i=0; inr_frags; i++) ++ if (skb_shinfo(skb)->frags[i].page >= highmem_start_page) ++ return 1; ++ ++ return 0; ++} ++#else ++#define illegal_highdma(dev, skb) (0) ++#endif ++ ++/** ++ * dev_queue_xmit - transmit a buffer ++ * @skb: buffer to transmit ++ * ++ * Queue a buffer for transmission to a network device. The caller must ++ * have set the device and priority and built the buffer before calling this ++ * function. The function can be called from an interrupt. ++ * ++ * A negative errno code is returned on a failure. A success does not ++ * guarantee the frame will be transmitted as it may be dropped due ++ * to congestion or traffic shaping. ++ */ ++ ++int dev_queue_xmit(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ struct Qdisc *q; ++ ++ if (skb_shinfo(skb)->frag_list && ++ !(dev->features&NETIF_F_FRAGLIST) && ++ skb_linearize(skb, GFP_ATOMIC) != 0) { ++ kfree_skb(skb); ++ return -ENOMEM; ++ } ++ ++ /* Fragmented skb is linearized if device does not support SG, ++ * or if at least one of fragments is in highmem and device ++ * does not support DMA from it. ++ */ ++ if (skb_shinfo(skb)->nr_frags && ++ (!(dev->features&NETIF_F_SG) || illegal_highdma(dev, skb)) && ++ skb_linearize(skb, GFP_ATOMIC) != 0) { ++ kfree_skb(skb); ++ return -ENOMEM; ++ } ++ ++ /* If packet is not checksummed and device does not support ++ * checksumming for this protocol, complete checksumming here. 
++ */ ++ if (skb->ip_summed == CHECKSUM_HW && ++ (!(dev->features&(NETIF_F_HW_CSUM|NETIF_F_NO_CSUM)) && ++ (!(dev->features&NETIF_F_IP_CSUM) || ++ skb->protocol != htons(ETH_P_IP)))) { ++ if ((skb = skb_checksum_help(skb)) == NULL) ++ return -ENOMEM; ++ } ++ ++ /* Grab device queue */ ++ spin_lock_bh(&dev->queue_lock); ++ q = dev->qdisc; ++ if (q->enqueue) { ++ int ret = q->enqueue(skb, q); ++ ++ qdisc_run(dev); ++ ++ spin_unlock_bh(&dev->queue_lock); ++ return ret == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : ret; ++ } ++ ++ /* The device has no queue. Common case for software devices: ++ loopback, all the sorts of tunnels... ++ ++ Really, it is unlikely that xmit_lock protection is necessary here. ++ (f.e. loopback and IP tunnels are clean ignoring statistics counters.) ++ However, it is possible, that they rely on protection ++ made by us here. ++ ++ Check this and shot the lock. It is not prone from deadlocks. ++ Either shot noqueue qdisc, it is even simpler 8) ++ */ ++ if (dev->flags&IFF_UP) { ++ int cpu = smp_processor_id(); ++ ++ if (dev->xmit_lock_owner != cpu) { ++ spin_unlock(&dev->queue_lock); ++ spin_lock(&dev->xmit_lock); ++ dev->xmit_lock_owner = cpu; ++ ++ if (!netif_queue_stopped(dev)) { ++ if (netdev_nit) ++ dev_queue_xmit_nit(skb,dev); ++ ++ if (dev->hard_start_xmit(skb, dev) == 0) { ++ dev->xmit_lock_owner = -1; ++ spin_unlock_bh(&dev->xmit_lock); ++ return 0; ++ } ++ } ++ dev->xmit_lock_owner = -1; ++ spin_unlock_bh(&dev->xmit_lock); ++ if (net_ratelimit()) ++ printk(KERN_CRIT "Virtual device %s asks to queue packet!\n", dev->name); ++ kfree_skb(skb); ++ return -ENETDOWN; ++ } else { ++ /* Recursion is detected! 
It is possible, unfortunately */ ++ if (net_ratelimit()) ++ printk(KERN_CRIT "Dead loop on virtual device %s, fix it urgently!\n", dev->name); ++ } ++ } ++ spin_unlock_bh(&dev->queue_lock); ++ ++ kfree_skb(skb); ++ return -ENETDOWN; ++} ++ ++ ++/*======================================================================= ++ Receiver routines ++ =======================================================================*/ ++ ++int netdev_max_backlog = 300; ++int weight_p = 64; /* old backlog weight */ ++/* These numbers are selected based on intuition and some ++ * experimentatiom, if you have more scientific way of doing this ++ * please go ahead and fix things. ++ */ ++int no_cong_thresh = 10; ++int no_cong = 20; ++int lo_cong = 100; ++int mod_cong = 290; ++ ++struct netif_rx_stats netdev_rx_stat[NR_CPUS]; ++ ++ ++#ifdef CONFIG_NET_HW_FLOWCONTROL ++atomic_t netdev_dropping = ATOMIC_INIT(0); ++static unsigned long netdev_fc_mask = 1; ++unsigned long netdev_fc_xoff = 0; ++spinlock_t netdev_fc_lock = SPIN_LOCK_UNLOCKED; ++ ++static struct ++{ ++ void (*stimul)(struct net_device *); ++ struct net_device *dev; ++} netdev_fc_slots[BITS_PER_LONG]; ++ ++int netdev_register_fc(struct net_device *dev, void (*stimul)(struct net_device *dev)) ++{ ++ int bit = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&netdev_fc_lock, flags); ++ if (netdev_fc_mask != ~0UL) { ++ bit = ffz(netdev_fc_mask); ++ netdev_fc_slots[bit].stimul = stimul; ++ netdev_fc_slots[bit].dev = dev; ++ set_bit(bit, &netdev_fc_mask); ++ clear_bit(bit, &netdev_fc_xoff); ++ } ++ spin_unlock_irqrestore(&netdev_fc_lock, flags); ++ return bit; ++} ++ ++void netdev_unregister_fc(int bit) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&netdev_fc_lock, flags); ++ if (bit > 0) { ++ netdev_fc_slots[bit].stimul = NULL; ++ netdev_fc_slots[bit].dev = NULL; ++ clear_bit(bit, &netdev_fc_mask); ++ clear_bit(bit, &netdev_fc_xoff); ++ } ++ spin_unlock_irqrestore(&netdev_fc_lock, flags); ++} ++ ++static void netdev_wakeup(void) 
++{ ++ unsigned long xoff; ++ ++ spin_lock(&netdev_fc_lock); ++ xoff = netdev_fc_xoff; ++ netdev_fc_xoff = 0; ++ while (xoff) { ++ int i = ffz(~xoff); ++ xoff &= ~(1<> 1)+ (blog >> 1); ++ ++ if (avg_blog > mod_cong) { ++ /* Above moderate congestion levels. */ ++ softnet_data[cpu].cng_level = NET_RX_CN_HIGH; ++#ifdef RAND_LIE ++ rd = net_random(); ++ rq = rd % netdev_max_backlog; ++ if (rq < avg_blog) /* unlucky bastard */ ++ softnet_data[cpu].cng_level = NET_RX_DROP; ++#endif ++ } else if (avg_blog > lo_cong) { ++ softnet_data[cpu].cng_level = NET_RX_CN_MOD; ++#ifdef RAND_LIE ++ rd = net_random(); ++ rq = rd % netdev_max_backlog; ++ if (rq < avg_blog) /* unlucky bastard */ ++ softnet_data[cpu].cng_level = NET_RX_CN_HIGH; ++#endif ++ } else if (avg_blog > no_cong) ++ softnet_data[cpu].cng_level = NET_RX_CN_LOW; ++ else /* no congestion */ ++ softnet_data[cpu].cng_level = NET_RX_SUCCESS; ++ ++ softnet_data[cpu].avg_blog = avg_blog; ++} ++ ++#ifdef OFFLINE_SAMPLE ++static void sample_queue(unsigned long dummy) ++{ ++/* 10 ms 0r 1ms -- i dont care -- JHS */ ++ int next_tick = 1; ++ int cpu = smp_processor_id(); ++ ++ get_sample_stats(cpu); ++ next_tick += jiffies; ++ mod_timer(&samp_timer, next_tick); ++} ++#endif ++ ++ ++/** ++ * netif_rx - post buffer to the network code ++ * @skb: buffer to post ++ * ++ * This function receives a packet from a device driver and queues it for ++ * the upper (protocol) levels to process. It always succeeds. The buffer ++ * may be dropped during processing for congestion control or by the ++ * protocol layers. 
++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_CN_LOW (low congestion) ++ * NET_RX_CN_MOD (moderate congestion) ++ * NET_RX_CN_HIGH (high congestion) ++ * NET_RX_DROP (packet was dropped) ++ * ++ * ++ */ ++ ++int netif_rx(struct sk_buff *skb) ++{ ++ int this_cpu = smp_processor_id(); ++ struct softnet_data *queue; ++ unsigned long flags; ++ ++ if (skb->stamp.tv_sec == 0) ++ do_gettimeofday(&skb->stamp); ++ ++ /* The code is rearranged so that the path is the most ++ short when CPU is congested, but is still operating. ++ */ ++ queue = &softnet_data[this_cpu]; ++ ++ local_irq_save(flags); ++ ++ netdev_rx_stat[this_cpu].total++; ++ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { ++ if (queue->input_pkt_queue.qlen) { ++ if (queue->throttle) ++ goto drop; ++ ++enqueue: ++ dev_hold(skb->dev); ++ __skb_queue_tail(&queue->input_pkt_queue,skb); ++ local_irq_restore(flags); ++#ifndef OFFLINE_SAMPLE ++ get_sample_stats(this_cpu); ++#endif ++ return queue->cng_level; ++ } ++ ++ if (queue->throttle) { ++ queue->throttle = 0; ++#ifdef CONFIG_NET_HW_FLOWCONTROL ++ if (atomic_dec_and_test(&netdev_dropping)) ++ netdev_wakeup(); ++#endif ++ } ++ ++ netif_rx_schedule(&queue->blog_dev); ++ goto enqueue; ++ } ++ ++ if (queue->throttle == 0) { ++ queue->throttle = 1; ++ netdev_rx_stat[this_cpu].throttled++; ++#ifdef CONFIG_NET_HW_FLOWCONTROL ++ atomic_inc(&netdev_dropping); ++#endif ++ } ++ ++drop: ++ netdev_rx_stat[this_cpu].dropped++; ++ local_irq_restore(flags); ++ ++ kfree_skb(skb); ++ return NET_RX_DROP; ++} ++ ++/* Deliver skb to an old protocol, which is not threaded well ++ or which do not understand shared skbs. 
++ */ ++static int deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int last) ++{ ++ static spinlock_t net_bh_lock = SPIN_LOCK_UNLOCKED; ++ int ret = NET_RX_DROP; ++ ++ ++ if (!last) { ++ skb = skb_clone(skb, GFP_ATOMIC); ++ if (skb == NULL) ++ return ret; ++ } ++ if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) { ++ kfree_skb(skb); ++ return ret; ++ } ++ ++ /* The assumption (correct one) is that old protocols ++ did not depened on BHs different of NET_BH and TIMER_BH. ++ */ ++ ++ /* Emulate NET_BH with special spinlock */ ++ spin_lock(&net_bh_lock); ++ ++ /* Disable timers and wait for all timers completion */ ++ tasklet_disable(bh_task_vec+TIMER_BH); ++ ++ ret = pt->func(skb, skb->dev, pt); ++ ++ tasklet_hi_enable(bh_task_vec+TIMER_BH); ++ spin_unlock(&net_bh_lock); ++ return ret; ++} ++ ++static __inline__ void skb_bond(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ ++ if (dev->master) { ++ skb->real_dev = skb->dev; ++ skb->dev = dev->master; ++ } ++} ++ ++static void net_tx_action(struct softirq_action *h) ++{ ++ int cpu = smp_processor_id(); ++ ++ if (softnet_data[cpu].completion_queue) { ++ struct sk_buff *clist; ++ ++ local_irq_disable(); ++ clist = softnet_data[cpu].completion_queue; ++ softnet_data[cpu].completion_queue = NULL; ++ local_irq_enable(); ++ ++ while (clist != NULL) { ++ struct sk_buff *skb = clist; ++ clist = clist->next; ++ ++ BUG_TRAP(atomic_read(&skb->users) == 0); ++ __kfree_skb(skb); ++ } ++ } ++ ++ if (softnet_data[cpu].output_queue) { ++ struct net_device *head; ++ ++ local_irq_disable(); ++ head = softnet_data[cpu].output_queue; ++ softnet_data[cpu].output_queue = NULL; ++ local_irq_enable(); ++ ++ while (head != NULL) { ++ struct net_device *dev = head; ++ head = head->next_sched; ++ ++ smp_mb__before_clear_bit(); ++ clear_bit(__LINK_STATE_SCHED, &dev->state); ++ ++ if (spin_trylock(&dev->queue_lock)) { ++ qdisc_run(dev); ++ spin_unlock(&dev->queue_lock); ++ } else { ++ 
netif_schedule(dev); ++ } ++ } ++ } ++} ++ ++ ++#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) ++void (*br_handle_frame_hook)(struct sk_buff *skb) = NULL; ++#endif ++ ++static __inline__ int handle_bridge(struct sk_buff *skb, ++ struct packet_type *pt_prev) ++{ ++ int ret = NET_RX_DROP; ++ ++ if (pt_prev) { ++ if (!pt_prev->data) ++ ret = deliver_to_old_ones(pt_prev, skb, 0); ++ else { ++ atomic_inc(&skb->users); ++ ret = pt_prev->func(skb, skb->dev, pt_prev); ++ } ++ } ++ ++ br_handle_frame_hook(skb); ++ return ret; ++} ++ ++ ++#ifdef CONFIG_NET_DIVERT ++static inline int handle_diverter(struct sk_buff *skb) ++{ ++ /* if diversion is supported on device, then divert */ ++ if (skb->dev->divert && skb->dev->divert->divert) ++ divert_frame(skb); ++ return 0; ++} ++#endif /* CONFIG_NET_DIVERT */ ++ ++int netif_receive_skb(struct sk_buff *skb) ++{ ++ struct packet_type *ptype, *pt_prev; ++ int ret = NET_RX_DROP; ++ unsigned short type; ++ ++ if (skb->stamp.tv_sec == 0) ++ do_gettimeofday(&skb->stamp); ++ ++ skb_bond(skb); ++ ++ netdev_rx_stat[smp_processor_id()].total++; ++ ++#ifdef CONFIG_NET_FASTROUTE ++ if (skb->pkt_type == PACKET_FASTROUTE) { ++ netdev_rx_stat[smp_processor_id()].fastroute_deferred_out++; ++ return dev_queue_xmit(skb); ++ } ++#endif ++ ++ skb->h.raw = skb->nh.raw = skb->data; ++ ++ pt_prev = NULL; ++ for (ptype = ptype_all; ptype; ptype = ptype->next) { ++ if (!ptype->dev || ptype->dev == skb->dev) { ++ if (pt_prev) { ++ if (!pt_prev->data) { ++ ret = deliver_to_old_ones(pt_prev, skb, 0); ++ } else { ++ atomic_inc(&skb->users); ++ ret = pt_prev->func(skb, skb->dev, pt_prev); ++ } ++ } ++ pt_prev = ptype; ++ } ++ } ++ ++#ifdef CONFIG_NET_DIVERT ++ if (skb->dev->divert && skb->dev->divert->divert) ++ ret = handle_diverter(skb); ++#endif /* CONFIG_NET_DIVERT */ ++ ++#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) ++ if (skb->dev->br_port != NULL && br_handle_frame_hook != NULL && ++ skb->pkt_type != PACKET_LOOPBACK) { ++ 
return handle_bridge(skb, pt_prev); ++ } ++#endif ++ ++ type = skb->protocol; ++ for (ptype=ptype_base[ntohs(type)&15];ptype;ptype=ptype->next) { ++ if (ptype->type == type && ++ (!ptype->dev || ptype->dev == skb->dev)) { ++ if (pt_prev) { ++ if (!pt_prev->data) { ++ ret = deliver_to_old_ones(pt_prev, skb, 0); ++ } else { ++ atomic_inc(&skb->users); ++ ret = pt_prev->func(skb, skb->dev, pt_prev); ++ } ++ } ++ pt_prev = ptype; ++ } ++ } ++ ++ if (pt_prev) { ++ if (!pt_prev->data) { ++ ret = deliver_to_old_ones(pt_prev, skb, 1); ++ } else { ++ ret = pt_prev->func(skb, skb->dev, pt_prev); ++ } ++ } else { ++ kfree_skb(skb); ++ /* Jamal, now you will not able to escape explaining ++ * me how you were going to use this. :-) ++ */ ++ ret = NET_RX_DROP; ++ } ++ ++ return ret; ++} ++ ++static int process_backlog(struct net_device *backlog_dev, int *budget) ++{ ++ int work = 0; ++ int quota = min(backlog_dev->quota, *budget); ++ int this_cpu = smp_processor_id(); ++ struct softnet_data *queue = &softnet_data[this_cpu]; ++ unsigned long start_time = jiffies; ++ ++ for (;;) { ++ struct sk_buff *skb; ++ struct net_device *dev; ++ ++ local_irq_disable(); ++ skb = __skb_dequeue(&queue->input_pkt_queue); ++ if (skb == NULL) ++ goto job_done; ++ local_irq_enable(); ++ ++ dev = skb->dev; ++ ++ netif_receive_skb(skb); ++ ++ dev_put(dev); ++ ++ work++; ++ ++ if (work >= quota || jiffies - start_time > 1) ++ break; ++ ++#ifdef CONFIG_NET_HW_FLOWCONTROL ++ if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) { ++ queue->throttle = 0; ++ if (atomic_dec_and_test(&netdev_dropping)) { ++ netdev_wakeup(); ++ break; ++ } ++ } ++#endif ++ } ++ ++ backlog_dev->quota -= work; ++ *budget -= work; ++ return -1; ++ ++job_done: ++ backlog_dev->quota -= work; ++ *budget -= work; ++ ++ list_del(&backlog_dev->poll_list); ++ smp_mb__before_clear_bit(); ++ netif_poll_enable(backlog_dev); ++ ++ if (queue->throttle) { ++ queue->throttle = 0; ++#ifdef CONFIG_NET_HW_FLOWCONTROL ++ if 
(atomic_dec_and_test(&netdev_dropping)) ++ netdev_wakeup(); ++#endif ++ } ++ local_irq_enable(); ++ return 0; ++} ++ ++static void net_rx_action(struct softirq_action *h) ++{ ++ int this_cpu = smp_processor_id(); ++ struct softnet_data *queue = &softnet_data[this_cpu]; ++ unsigned long start_time = jiffies; ++ int budget = netdev_max_backlog; ++ ++ br_read_lock(BR_NETPROTO_LOCK); ++ local_irq_disable(); ++ ++ while (!list_empty(&queue->poll_list)) { ++ struct net_device *dev; ++ ++ if (budget <= 0 || jiffies - start_time > 1) ++ goto softnet_break; ++ ++ local_irq_enable(); ++ ++ dev = list_entry(queue->poll_list.next, struct net_device, poll_list); ++ ++ if (dev->quota <= 0 || dev->poll(dev, &budget)) { ++ local_irq_disable(); ++ list_del(&dev->poll_list); ++ list_add_tail(&dev->poll_list, &queue->poll_list); ++ if (dev->quota < 0) ++ dev->quota += dev->weight; ++ else ++ dev->quota = dev->weight; ++ } else { ++ dev_put(dev); ++ local_irq_disable(); ++ } ++ } ++ ++ local_irq_enable(); ++ br_read_unlock(BR_NETPROTO_LOCK); ++ return; ++ ++softnet_break: ++ netdev_rx_stat[this_cpu].time_squeeze++; ++ __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ); ++ ++ local_irq_enable(); ++ br_read_unlock(BR_NETPROTO_LOCK); ++} ++ ++static gifconf_func_t * gifconf_list [NPROTO]; ++ ++/** ++ * register_gifconf - register a SIOCGIF handler ++ * @family: Address family ++ * @gifconf: Function handler ++ * ++ * Register protocol dependent address dumping routines. The handler ++ * that is passed must not be freed or reused until it has been replaced ++ * by another handler. ++ */ ++ ++int register_gifconf(unsigned int family, gifconf_func_t * gifconf) ++{ ++ if (family>=NPROTO) ++ return -EINVAL; ++ gifconf_list[family] = gifconf; ++ return 0; ++} ++ ++ ++/* ++ * Map an interface index to its name (SIOCGIFNAME) ++ */ ++ ++/* ++ * We need this ioctl for efficient implementation of the ++ * if_indextoname() function required by the IPv6 API. 
Without ++ * it, we would have to search all the interfaces to find a ++ * match. --pb ++ */ ++ ++static int dev_ifname(struct ifreq *arg) ++{ ++ struct net_device *dev; ++ struct ifreq ifr; ++ ++ /* ++ * Fetch the caller's info block. ++ */ ++ ++ if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) ++ return -EFAULT; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_index(ifr.ifr_ifindex); ++ if (!dev) { ++ read_unlock(&dev_base_lock); ++ return -ENODEV; ++ } ++ ++ strcpy(ifr.ifr_name, dev->name); ++ read_unlock(&dev_base_lock); ++ ++ if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) ++ return -EFAULT; ++ return 0; ++} ++ ++/* ++ * Perform a SIOCGIFCONF call. This structure will change ++ * size eventually, and there is nothing I can do about it. ++ * Thus we will need a 'compatibility mode'. ++ */ ++ ++static int dev_ifconf(char *arg) ++{ ++ struct ifconf ifc; ++ struct net_device *dev; ++ char *pos; ++ int len; ++ int total; ++ int i; ++ ++ /* ++ * Fetch the caller's info block. ++ */ ++ ++ if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) ++ return -EFAULT; ++ ++ pos = ifc.ifc_buf; ++ len = ifc.ifc_len; ++ ++ /* ++ * Loop over the interfaces, and write an info block for each. ++ */ ++ ++ total = 0; ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ for (i=0; iget_stats ? 
dev->get_stats(dev): NULL); ++ int size; ++ ++ if (stats) ++ size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu %8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", ++ dev->name, ++ stats->rx_bytes, ++ stats->rx_packets, stats->rx_errors, ++ stats->rx_dropped + stats->rx_missed_errors, ++ stats->rx_fifo_errors, ++ stats->rx_length_errors + stats->rx_over_errors ++ + stats->rx_crc_errors + stats->rx_frame_errors, ++ stats->rx_compressed, stats->multicast, ++ stats->tx_bytes, ++ stats->tx_packets, stats->tx_errors, stats->tx_dropped, ++ stats->tx_fifo_errors, stats->collisions, ++ stats->tx_carrier_errors + stats->tx_aborted_errors ++ + stats->tx_window_errors + stats->tx_heartbeat_errors, ++ stats->tx_compressed); ++ else ++ size = sprintf(buffer, "%6s: No statistics available.\n", dev->name); ++ ++ return size; ++} ++ ++/* ++ * Called from the PROCfs module. This now uses the new arbitrary sized /proc/net interface ++ * to create /proc/net/dev ++ */ ++ ++static int dev_get_info(char *buffer, char **start, off_t offset, int length) ++{ ++ int len = 0; ++ off_t begin = 0; ++ off_t pos = 0; ++ int size; ++ struct net_device *dev; ++ ++ ++ size = sprintf(buffer, ++ "Inter-| Receive | Transmit\n" ++ " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n"); ++ ++ pos += size; ++ len += size; ++ ++ ++ read_lock(&dev_base_lock); ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ size = sprintf_stats(buffer+len, dev); ++ len += size; ++ pos = begin + len; ++ ++ if (pos < offset) { ++ len = 0; ++ begin = pos; ++ } ++ if (pos > offset + length) ++ break; ++ } ++ read_unlock(&dev_base_lock); ++ ++ *start = buffer + (offset - begin); /* Start of wanted data */ ++ len -= (offset - begin); /* Start slop */ ++ if (len > length) ++ len = length; /* Ending slop */ ++ if (len < 0) ++ len = 0; ++ return len; ++} ++ ++static int dev_proc_stats(char *buffer, char **start, off_t offset, ++ int length, int 
*eof, void *data) ++{ ++ int i, lcpu; ++ int len=0; ++ ++ for (lcpu=0; lcpu length) ++ len = length; ++ if (len < 0) ++ len = 0; ++ ++ *start = buffer + offset; ++ *eof = 1; ++ ++ return len; ++} ++ ++#endif /* CONFIG_PROC_FS */ ++ ++ ++/** ++ * netdev_set_master - set up master/slave pair ++ * @slave: slave device ++ * @master: new master device ++ * ++ * Changes the master device of the slave. Pass %NULL to break the ++ * bonding. The caller must hold the RTNL semaphore. On failure ++ * a negative errno code is returned (-EBUSY when a new master is requested while the slave already has one). On success the reference counts ++ * are adjusted (new master held, old master released), %RTM_NEWLINK is sent to the routing socket and the ++ * function returns zero. ++ */ ++ ++int netdev_set_master(struct net_device *slave, struct net_device *master) ++{ ++ struct net_device *old = slave->master; ++ ++ ASSERT_RTNL(); ++ ++ if (master) { ++ if (old) ++ return -EBUSY; ++ dev_hold(master); ++ } ++ ++ br_write_lock_bh(BR_NETPROTO_LOCK); ++ slave->master = master; ++ br_write_unlock_bh(BR_NETPROTO_LOCK); ++ ++ if (old) ++ dev_put(old); ++ ++ if (master) ++ slave->flags |= IFF_SLAVE; ++ else ++ slave->flags &= ~IFF_SLAVE; ++ ++ rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); ++ return 0; ++} ++ ++/** ++ * dev_set_promiscuity - update promiscuity count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove promiscuity from a device. While the count in the device ++ * remains above zero the interface remains promiscuous. Once it hits zero ++ * the device reverts back to normal filtering operation. A negative @inc ++ * value is used to drop promiscuity on the device.
++ */ ++ ++void dev_set_promiscuity(struct net_device *dev, int inc) ++{ ++ unsigned short old_flags = dev->flags; ++ ++ dev->flags |= IFF_PROMISC; ++ if ((dev->promiscuity += inc) == 0) ++ dev->flags &= ~IFF_PROMISC; ++ if (dev->flags^old_flags) { ++#ifdef CONFIG_NET_FASTROUTE ++ if (dev->flags&IFF_PROMISC) { ++ netdev_fastroute_obstacles++; ++ dev_clear_fastroute(dev); ++ } else ++ netdev_fastroute_obstacles--; ++#endif ++ dev_mc_upload(dev); ++ printk(KERN_INFO "device %s %s promiscuous mode\n", ++ dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left"); ++ } ++} ++ ++/** ++ * dev_set_allmulti - update allmulti count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove reception of all multicast frames to a device. While the ++ * count in the device remains above zero the interface keeps receiving ++ * all multicast frames. Once it hits zero the device reverts back to normal ++ * filtering operation. A negative @inc value is used to drop the counter ++ * when releasing a resource needing all multicasts. ++ */ ++ ++void dev_set_allmulti(struct net_device *dev, int inc) ++{ ++ unsigned short old_flags = dev->flags; ++ ++ dev->flags |= IFF_ALLMULTI; ++ if ((dev->allmulti += inc) == 0) ++ dev->flags &= ~IFF_ALLMULTI; ++ if (dev->flags^old_flags) ++ dev_mc_upload(dev); ++} ++ ++int dev_change_flags(struct net_device *dev, unsigned flags) ++{ ++ int ret; ++ int old_flags = dev->flags; ++ ++ /* ++ * Set the flags on our device: user-settable bits come from @flags, while IFF_UP, IFF_VOLATILE, IFF_PROMISC and IFF_ALLMULTI keep their current values. ++ */ ++ ++ dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP|IFF_DYNAMIC| ++ IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) | ++ (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI)); ++ ++ /* ++ * Load in the correct multicast list now the flags have changed. ++ */ ++ ++ dev_mc_upload(dev); ++ ++ /* ++ * Have we downed the interface. We handle IFF_UP ourselves ++ * according to user attempts to set it, rather than blindly ++ * setting it.
++ */ ++ ++ ret = 0; ++ if ((old_flags^flags)&IFF_UP) /* IFF_UP bit differs: close the device if it was up, open it otherwise */ ++ { ++ ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); ++ ++ if (ret == 0) ++ dev_mc_upload(dev); ++ } ++ ++ if (dev->flags&IFF_UP && ++ ((old_flags^dev->flags)&~(IFF_UP|IFF_PROMISC|IFF_ALLMULTI|IFF_VOLATILE))) ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); ++ ++ if ((flags^dev->gflags)&IFF_PROMISC) { ++ int inc = (flags&IFF_PROMISC) ? +1 : -1; ++ dev->gflags ^= IFF_PROMISC; ++ dev_set_promiscuity(dev, inc); ++ } ++ ++ /* NOTE: the order in which IFF_PROMISC and IFF_ALLMULTI are synchronized ++ is important. Some (broken) drivers set IFF_PROMISC when ++ IFF_ALLMULTI is requested, without asking us and without reporting it. ++ */ ++ if ((flags^dev->gflags)&IFF_ALLMULTI) { ++ int inc = (flags&IFF_ALLMULTI) ? +1 : -1; ++ dev->gflags ^= IFF_ALLMULTI; ++ dev_set_allmulti(dev, inc); ++ } ++ ++ if (old_flags^dev->flags) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags^dev->flags); ++ ++ return ret; ++} ++ ++/* ++ * Perform the SIOCxIFxxx calls.
++ */ ++ ++static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) ++{ ++ struct net_device *dev; ++ int err; ++ ++ if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL) ++ return -ENODEV; ++ ++ switch(cmd) ++ { ++ case SIOCGIFFLAGS: /* Get interface flags */ ++ ifr->ifr_flags = (dev->flags&~(IFF_PROMISC|IFF_ALLMULTI|IFF_RUNNING)) ++ |(dev->gflags&(IFF_PROMISC|IFF_ALLMULTI)); ++ if (netif_running(dev) && netif_carrier_ok(dev)) ++ ifr->ifr_flags |= IFF_RUNNING; ++ return 0; ++ ++ case SIOCSIFFLAGS: /* Set interface flags */ ++ return dev_change_flags(dev, ifr->ifr_flags); ++ ++ case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ ++ ifr->ifr_metric = 0; ++ return 0; ++ ++ case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ ++ return -EOPNOTSUPP; ++ ++ case SIOCGIFMTU: /* Get the MTU of a device */ ++ ifr->ifr_mtu = dev->mtu; ++ return 0; ++ ++ case SIOCSIFMTU: /* Set the MTU of a device */ ++ if (ifr->ifr_mtu == dev->mtu) ++ return 0; ++ ++ /* ++ * MTU must be positive. 
++ */ ++ ++ if (ifr->ifr_mtu<0) ++ return -EINVAL; ++ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ if (dev->change_mtu) ++ err = dev->change_mtu(dev, ifr->ifr_mtu); ++ else { ++ dev->mtu = ifr->ifr_mtu; ++ err = 0; ++ } ++ if (!err && dev->flags&IFF_UP) ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev); ++ return err; ++ ++ case SIOCGIFHWADDR: ++ memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); ++ ifr->ifr_hwaddr.sa_family=dev->type; ++ return 0; ++ ++ case SIOCSIFHWADDR: ++ if (dev->set_mac_address == NULL) ++ return -EOPNOTSUPP; ++ if (ifr->ifr_hwaddr.sa_family!=dev->type) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ err = dev->set_mac_address(dev, &ifr->ifr_hwaddr); ++ if (!err) ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); ++ return err; ++ ++ case SIOCSIFHWBROADCAST: ++ if (ifr->ifr_hwaddr.sa_family!=dev->type) ++ return -EINVAL; ++ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN); ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); ++ return 0; ++ ++ case SIOCGIFMAP: ++ ifr->ifr_map.mem_start=dev->mem_start; ++ ifr->ifr_map.mem_end=dev->mem_end; ++ ifr->ifr_map.base_addr=dev->base_addr; ++ ifr->ifr_map.irq=dev->irq; ++ ifr->ifr_map.dma=dev->dma; ++ ifr->ifr_map.port=dev->if_port; ++ return 0; ++ ++ case SIOCSIFMAP: ++ if (dev->set_config) { ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return dev->set_config(dev,&ifr->ifr_map); ++ } ++ return -EOPNOTSUPP; ++ ++ case SIOCADDMULTI: ++ if (dev->set_multicast_list == NULL || ++ ifr->ifr_hwaddr.sa_family != AF_UNSPEC) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1); ++ return 0; ++ ++ case SIOCDELMULTI: ++ if (dev->set_multicast_list == NULL || ++ ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ 
dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1); ++ return 0; ++ ++ case SIOCGIFINDEX: ++ ifr->ifr_ifindex = dev->ifindex; ++ return 0; ++ ++ case SIOCGIFTXQLEN: ++ ifr->ifr_qlen = dev->tx_queue_len; ++ return 0; ++ ++ case SIOCSIFTXQLEN: ++ if (ifr->ifr_qlen<0) ++ return -EINVAL; ++ dev->tx_queue_len = ifr->ifr_qlen; ++ return 0; ++ ++ case SIOCSIFNAME: ++ if (dev->flags&IFF_UP) ++ return -EBUSY; ++ /* Check if name contains a wildcard */ ++ if (strchr(ifr->ifr_newname, '%')) { ++ char format[IFNAMSIZ + 1]; ++ int ret; ++ memcpy(format, ifr->ifr_newname, IFNAMSIZ); ++ format[IFNAMSIZ-1] = 0; ++ /* Find a free name based on format. ++ * dev_alloc_name() replaces "%d" with at max ++ * 2 digits, so no name overflow. - Jean II */ ++ ret = dev_alloc_name(dev, format); ++ if (ret < 0) ++ return ret; ++ /* Copy the new name back to caller. */ ++ strncpy(ifr->ifr_newname, dev->name, IFNAMSIZ); ++ } else { ++ if (__dev_get_by_name(ifr->ifr_newname)) ++ return -EEXIST; ++ memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ); ++ dev->name[IFNAMSIZ-1] = 0; ++ } ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); ++ return 0; ++ ++ /* ++ * Unknown or private ioctl ++ */ ++ ++ default: ++ if ((cmd >= SIOCDEVPRIVATE && ++ cmd <= SIOCDEVPRIVATE + 15) || ++ cmd == SIOCBONDENSLAVE || ++ cmd == SIOCBONDRELEASE || ++ cmd == SIOCBONDSETHWADDR || ++ cmd == SIOCBONDSLAVEINFOQUERY || ++ cmd == SIOCBONDINFOQUERY || ++ cmd == SIOCBONDCHANGEACTIVE || ++ cmd == SIOCGMIIPHY || ++ cmd == SIOCGMIIREG || ++ cmd == SIOCSMIIREG || ++ cmd == SIOCWANDEV) { ++ if (dev->do_ioctl) { ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return dev->do_ioctl(dev, ifr, cmd); ++ } ++ return -EOPNOTSUPP; ++ } ++ ++ } ++ return -EINVAL; ++} ++ ++/* ++ * This function handles all "interface"-type I/O control requests. The actual ++ * 'doing' part of this is dev_ifsioc above. 
++ */ ++ ++/** ++ * dev_ioctl - network device ioctl ++ * @cmd: command to issue ++ * @arg: pointer to a struct ifreq in user space ++ * ++ * Issue ioctl functions to devices. This is normally called by the ++ * user space syscall interfaces but can sometimes be useful for ++ * other purposes. The return value is the return from the syscall if ++ * positive or a negative errno code on error. ++ */ ++ ++int dev_ioctl(unsigned int cmd, void *arg) ++{ ++ struct ifreq ifr; ++ int ret; ++ char *colon; ++ ++ /* One special case: SIOCGIFCONF takes ifconf argument ++ and requires shared lock, because it sleeps writing ++ to user space. ++ */ ++ ++ if (cmd == SIOCGIFCONF) { ++ rtnl_shlock(); ++ ret = dev_ifconf((char *) arg); ++ rtnl_shunlock(); ++ return ret; ++ } ++ if (cmd == SIOCGIFNAME) { ++ return dev_ifname((struct ifreq *)arg); ++ } ++ ++ if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) ++ return -EFAULT; ++ ++ ifr.ifr_name[IFNAMSIZ-1] = 0; ++ ++ colon = strchr(ifr.ifr_name, ':'); ++ if (colon) ++ *colon = 0; ++ ++ /* ++ * See which interface the caller is talking about. ++ */ ++ ++ switch(cmd) ++ { ++ /* ++ * These ioctl calls: ++ * - can be done by all. ++ * - atomic and do not require locking. 
++ * - return a value ++ */ ++ ++ case SIOCGIFFLAGS: ++ case SIOCGIFMETRIC: ++ case SIOCGIFMTU: ++ case SIOCGIFHWADDR: ++ case SIOCGIFSLAVE: ++ case SIOCGIFMAP: ++ case SIOCGIFINDEX: ++ case SIOCGIFTXQLEN: ++ dev_load(ifr.ifr_name); ++ read_lock(&dev_base_lock); ++ ret = dev_ifsioc(&ifr, cmd); ++ read_unlock(&dev_base_lock); ++ if (!ret) { ++ if (colon) ++ *colon = ':'; ++ if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) ++ return -EFAULT; ++ } ++ return ret; ++ ++ case SIOCETHTOOL: ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ ret = dev_ethtool(&ifr); ++ rtnl_unlock(); ++ if (!ret) { ++ if (colon) ++ *colon = ':'; ++ if (copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ } ++ return ret; ++ ++ /* ++ * These ioctl calls: ++ * - require superuser power. ++ * - require strict serialization. ++ * - return a value ++ */ ++ ++ case SIOCSIFNAME: ++ case SIOCGMIIPHY: ++ case SIOCGMIIREG: ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ dev_load(ifr.ifr_name); ++ dev_probe_lock(); ++ rtnl_lock(); ++ ret = dev_ifsioc(&ifr, cmd); ++ rtnl_unlock(); ++ dev_probe_unlock(); ++ if (!ret) { ++ if (colon) ++ *colon = ':'; ++ if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) ++ return -EFAULT; ++ } ++ return ret; ++ ++ /* ++ * These ioctl calls: ++ * - require superuser power. ++ * - require strict serialization. 
++ * - do not return a value ++ */ ++ ++ case SIOCSIFFLAGS: ++ case SIOCSIFMETRIC: ++ case SIOCSIFMTU: ++ case SIOCSIFMAP: ++ case SIOCSIFHWADDR: ++ case SIOCSIFSLAVE: ++ case SIOCADDMULTI: ++ case SIOCDELMULTI: ++ case SIOCSIFHWBROADCAST: ++ case SIOCSIFTXQLEN: ++ case SIOCSMIIREG: ++ case SIOCBONDENSLAVE: ++ case SIOCBONDRELEASE: ++ case SIOCBONDSETHWADDR: ++ case SIOCBONDSLAVEINFOQUERY: ++ case SIOCBONDINFOQUERY: ++ case SIOCBONDCHANGEACTIVE: ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ dev_load(ifr.ifr_name); ++ dev_probe_lock(); ++ rtnl_lock(); ++ ret = dev_ifsioc(&ifr, cmd); ++ rtnl_unlock(); ++ dev_probe_unlock(); ++ return ret; ++ ++ case SIOCGIFMEM: ++ /* Get the per device memory space. We can add this but currently ++ do not support it */ ++ case SIOCSIFMEM: ++ /* Set the per device memory buffer space. Not applicable in our case */ ++ case SIOCSIFLINK: ++ return -EINVAL; ++ ++ /* ++ * Unknown or private ioctl. ++ */ ++ ++ default: ++ if (cmd == SIOCWANDEV || ++ (cmd >= SIOCDEVPRIVATE && ++ cmd <= SIOCDEVPRIVATE + 15)) { ++ dev_load(ifr.ifr_name); ++ dev_probe_lock(); ++ rtnl_lock(); ++ ret = dev_ifsioc(&ifr, cmd); ++ rtnl_unlock(); ++ dev_probe_unlock(); ++ if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) ++ return -EFAULT; ++ return ret; ++ } ++#ifdef WIRELESS_EXT ++ /* Take care of Wireless Extensions */ ++ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { ++ /* If command is `set a parameter', or ++ * `get the encoding parameters', check if ++ * the user has the right to do it */ ++ if (IW_IS_SET(cmd) || (cmd == SIOCGIWENCODE)) { ++ if(!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ } ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ /* Follow me in net/core/wireless.c */ ++ ret = wireless_process_ioctl(&ifr, cmd); ++ rtnl_unlock(); ++ if (!ret && IW_IS_GET(cmd) && ++ copy_to_user(arg, &ifr, sizeof(struct ifreq))) ++ return -EFAULT; ++ return ret; ++ } ++#endif /* WIRELESS_EXT */ ++ return -EINVAL; ++ } ++} ++ ++ ++/** ++ * dev_new_index - 
allocate an ifindex ++ * ++ * Returns a suitable unique value for a new device interface ++ * number. The caller must hold the rtnl semaphore or the ++ * dev_base_lock to be sure it remains unique. ++ */ ++ ++int dev_new_index(void) ++{ ++ static int ifindex; ++ for (;;) { ++ if (++ifindex <= 0) ++ ifindex=1; ++ if (__dev_get_by_index(ifindex) == NULL) ++ return ifindex; ++ } ++} ++ ++static int dev_boot_phase = 1; ++ ++/** ++ * register_netdevice - register a network device ++ * @dev: device to register ++ * ++ * Take a completed network device structure and add it to the kernel ++ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier ++ * chain. 0 is returned on success. A negative errno code is returned ++ * on a failure to set up the device, or if the name is a duplicate. ++ * ++ * Callers must hold the rtnl semaphore. See the comment at the ++ * end of Space.c for details about the locking. You may want ++ * register_netdev() instead of this. ++ * ++ * BUGS: ++ * The locking appears insufficient to guarantee two parallel registers ++ * will not get the same name. 
++ */ ++ ++int net_dev_init(void); ++ ++int register_netdevice(struct net_device *dev) ++{ ++ struct net_device *d, **dp; ++#ifdef CONFIG_NET_DIVERT ++ int ret; ++#endif ++ ++ spin_lock_init(&dev->queue_lock); ++ spin_lock_init(&dev->xmit_lock); ++ dev->xmit_lock_owner = -1; ++#ifdef CONFIG_NET_FASTROUTE ++ dev->fastpath_lock=RW_LOCK_UNLOCKED; ++#endif ++ ++ if (dev_boot_phase) ++ net_dev_init(); ++ ++#ifdef CONFIG_NET_DIVERT ++ ret = alloc_divert_blk(dev); ++ if (ret) ++ return ret; ++#endif /* CONFIG_NET_DIVERT */ ++ ++ dev->iflink = -1; ++ ++ /* Run the driver init hook, if one is set; a non-zero result aborts registration with -EIO */ ++ if (dev->init && dev->init(dev) != 0) { ++#ifdef CONFIG_NET_DIVERT ++ free_divert_blk(dev); ++#endif ++ return -EIO; ++ } ++ ++ dev->ifindex = dev_new_index(); ++ if (dev->iflink == -1) ++ dev->iflink = dev->ifindex; ++ ++ /* Refuse a device that is already listed or whose name is taken (-EEXIST); afterwards dp addresses the tail link used for the append below */ ++ for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { ++ if (d == dev || strcmp(d->name, dev->name) == 0) { ++#ifdef CONFIG_NET_DIVERT ++ free_divert_blk(dev); ++#endif ++ return -EEXIST; ++ } ++ } ++ ++ /* Fix illegal SG+CSUM combinations: NETIF_F_SG is cleared when no checksum feature is set. */ ++ if ((dev->features & NETIF_F_SG) && ++ !(dev->features & (NETIF_F_IP_CSUM | ++ NETIF_F_NO_CSUM | ++ NETIF_F_HW_CSUM))) { ++ printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", ++ dev->name); ++ dev->features &= ~NETIF_F_SG; ++ } ++ ++ /* ++ * Install the nil rebuild_header routine when the driver gave none; ++ * it should never be called and is used just as a bug trap. ++ */ ++ ++ if (dev->rebuild_header == NULL) ++ dev->rebuild_header = default_rebuild_header; ++ ++ /* ++ * Default initial state at registration is that the ++ * device is present. ++ */ ++ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ ++ dev->next = NULL; ++ dev_init_scheduler(dev); ++ write_lock_bh(&dev_base_lock); ++ *dp = dev; ++ dev_hold(dev); ++ dev->deadbeaf = 0; ++ write_unlock_bh(&dev_base_lock); ++ ++ /* Notify protocols, that a new device appeared.
*/ ++ notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); ++ ++ net_run_sbin_hotplug(dev, "register"); ++ ++ return 0; ++} ++ ++/** ++ * netdev_finish_unregister - complete unregistration ++ * @dev: device ++ * ++ * Destroy and free a dead device. A value of zero is returned on ++ * success. ++ */ ++ ++int netdev_finish_unregister(struct net_device *dev) ++{ ++ BUG_TRAP(dev->ip_ptr==NULL); ++ BUG_TRAP(dev->ip6_ptr==NULL); ++ BUG_TRAP(dev->dn_ptr==NULL); ++ ++ if (!dev->deadbeaf) { ++ printk(KERN_ERR "Freeing alive device %p, %s\n", dev, dev->name); ++ return 0; ++ } ++#ifdef NET_REFCNT_DEBUG ++ printk(KERN_DEBUG "netdev_finish_unregister: %s%s.\n", dev->name, ++ (dev->features & NETIF_F_DYNALLOC)?"":", old style"); ++#endif ++ if (dev->destructor) ++ dev->destructor(dev); ++ if (dev->features & NETIF_F_DYNALLOC) ++ kfree(dev); ++ return 0; ++} ++ ++/** ++ * unregister_netdevice - remove device from the kernel ++ * @dev: device ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. On success 0 is returned, on a failure ++ * a negative errno code is returned. ++ * ++ * Callers must hold the rtnl semaphore. See the comment at the ++ * end of Space.c for details about the locking. You may want ++ * unregister_netdev() instead of this. ++ */ ++ ++int unregister_netdevice(struct net_device *dev) ++{ ++ unsigned long now, warning_time; ++ struct net_device *d, **dp; ++ ++ /* If device is running, close it first. */ ++ if (dev->flags & IFF_UP) ++ dev_close(dev); ++ ++ BUG_TRAP(dev->deadbeaf==0); ++ dev->deadbeaf = 1; ++ ++ /* And unlink it from device chain. */ ++ for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) { ++ if (d == dev) { ++ write_lock_bh(&dev_base_lock); ++ *dp = d->next; ++ write_unlock_bh(&dev_base_lock); ++ break; ++ } ++ } ++ if (d == NULL) { ++ printk(KERN_DEBUG "unregister_netdevice: device %s/%p never was registered\n", dev->name, dev); ++ return -ENODEV; ++ } ++ ++ /* Synchronize to net_rx_action. 
*/ ++ br_write_lock_bh(BR_NETPROTO_LOCK); ++ br_write_unlock_bh(BR_NETPROTO_LOCK); ++ ++ if (dev_boot_phase == 0) { ++#ifdef CONFIG_NET_FASTROUTE ++ dev_clear_fastroute(dev); ++#endif ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ net_run_sbin_hotplug(dev, "unregister"); ++ ++ /* Notify protocols, that we are about to destroy ++ this device. They should clean all the things. ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); ++ ++ /* ++ * Flush the multicast chain ++ */ ++ dev_mc_discard(dev); ++ } ++ ++ if (dev->uninit) ++ dev->uninit(dev); ++ ++ /* Notifier chain MUST detach us from master device. */ ++ BUG_TRAP(dev->master==NULL); ++ ++#ifdef CONFIG_NET_DIVERT ++ free_divert_blk(dev); ++#endif ++ ++ if (dev->features & NETIF_F_DYNALLOC) { ++#ifdef NET_REFCNT_DEBUG ++ if (atomic_read(&dev->refcnt) != 1) ++ printk(KERN_DEBUG "unregister_netdevice: holding %s refcnt=%d\n", dev->name, atomic_read(&dev->refcnt)-1); ++#endif ++ dev_put(dev); ++ return 0; ++ } ++ ++ /* Last reference is our one */ ++ if (atomic_read(&dev->refcnt) == 1) { ++ dev_put(dev); ++ return 0; ++ } ++ ++#ifdef NET_REFCNT_DEBUG ++ printk("unregister_netdevice: waiting %s refcnt=%d\n", dev->name, atomic_read(&dev->refcnt)); ++#endif ++ ++ /* EXPLANATION. If dev->refcnt is not now 1 (our own reference) ++ it means that someone in the kernel still has a reference ++ to this device and we cannot release it. ++ ++ "New style" devices have destructors, hence we can return from this ++ function and destructor will do all the work later. As of kernel 2.4.0 ++ there are very few "New Style" devices. ++ ++ "Old style" devices expect that the device is free of any references ++ upon exit from this function. ++ We cannot return from this function until all such references have ++ fallen away. This is because the caller of this function will probably ++ immediately kfree(*dev) and then be unloaded via sys_delete_module. 
++ ++ So, we linger until all references fall away. The duration of the ++ linger is basically unbounded! It is driven by, for example, the ++ current setting of sysctl_ipfrag_time. ++ ++ After 1 second, we start to rebroadcast unregister notifications ++ in hope that careless clients will release the device. ++ ++ */ ++ ++ now = warning_time = jiffies; ++ while (atomic_read(&dev->refcnt) != 1) { ++ if ((jiffies - now) > 1*HZ) { ++ /* Rebroadcast unregister notification */ ++ notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); ++ } ++ current->state = TASK_INTERRUPTIBLE; ++ schedule_timeout(HZ/4); ++ current->state = TASK_RUNNING; ++ if ((jiffies - warning_time) > 10*HZ) { ++ printk(KERN_EMERG "unregister_netdevice: waiting for %s to " ++ "become free. Usage count = %d\n", ++ dev->name, atomic_read(&dev->refcnt)); ++ warning_time = jiffies; ++ } ++ } ++ dev_put(dev); ++ return 0; ++} ++ ++ ++/* ++ * Initialize the DEV module. At boot time this walks the device list and ++ * unhooks any devices that fail to initialise (normally hardware not ++ * present) and leaves us with a valid list of present and active devices. ++ * ++ */ ++ ++extern void net_device_init(void); ++extern void ip_auto_config(void); ++struct proc_dir_entry *proc_net_drivers; ++#ifdef CONFIG_NET_DIVERT ++extern void dv_init(void); ++#endif /* CONFIG_NET_DIVERT */ ++ ++ ++/* ++ * Callers must hold the rtnl semaphore. See the comment at the ++ * end of Space.c for details about the locking. ++ */ ++int __init net_dev_init(void) ++{ ++ struct net_device *dev, **dp; ++ int i; ++ ++ if (!dev_boot_phase) ++ return 0; ++ ++ ++#ifdef CONFIG_NET_DIVERT ++ dv_init(); ++#endif /* CONFIG_NET_DIVERT */ ++ ++ /* ++ * Initialise the packet receive queues. 
++ */ ++ ++ for (i = 0; i < NR_CPUS; i++) { ++ struct softnet_data *queue; ++ ++ queue = &softnet_data[i]; ++ skb_queue_head_init(&queue->input_pkt_queue); ++ queue->throttle = 0; ++ queue->cng_level = 0; ++ queue->avg_blog = 10; /* arbitrary non-zero */ ++ queue->completion_queue = NULL; ++ INIT_LIST_HEAD(&queue->poll_list); ++ set_bit(__LINK_STATE_START, &queue->blog_dev.state); ++ queue->blog_dev.weight = weight_p; ++ queue->blog_dev.poll = process_backlog; ++ atomic_set(&queue->blog_dev.refcnt, 1); ++ } ++ ++#ifdef CONFIG_NET_PROFILE ++ net_profile_init(); ++ NET_PROFILE_REGISTER(dev_queue_xmit); ++ NET_PROFILE_REGISTER(softnet_process); ++#endif ++ ++#ifdef OFFLINE_SAMPLE ++ samp_timer.expires = jiffies + (10 * HZ); ++ add_timer(&samp_timer); ++#endif ++ ++ /* ++ * Add the devices. ++ * If the call to dev->init fails, the dev is removed ++ * from the chain disconnecting the device until the ++ * next reboot. ++ * ++ * NB At boot phase networking is dead. No locking is required. ++ * But we still preserve dev_base_lock for sanity. ++ */ ++ ++ dp = &dev_base; ++ while ((dev = *dp) != NULL) { ++ spin_lock_init(&dev->queue_lock); ++ spin_lock_init(&dev->xmit_lock); ++#ifdef CONFIG_NET_FASTROUTE ++ dev->fastpath_lock = RW_LOCK_UNLOCKED; ++#endif ++ dev->xmit_lock_owner = -1; ++ dev->iflink = -1; ++ dev_hold(dev); ++ ++ /* ++ * Allocate name. If the init() fails ++ * the name will be reissued correctly. ++ */ ++ if (strchr(dev->name, '%')) ++ dev_alloc_name(dev, dev->name); ++ ++ /* ++ * Check boot time settings for the device. ++ */ ++ netdev_boot_setup_check(dev); ++ ++ if (dev->init && dev->init(dev)) { ++ /* ++ * It failed to come up. It will be unhooked later. ++ * dev_alloc_name can now advance to next suitable ++ * name that is checked next. 
++ */ ++ dev->deadbeaf = 1; ++ dp = &dev->next; ++ } else { ++ dp = &dev->next; ++ dev->ifindex = dev_new_index(); ++ if (dev->iflink == -1) ++ dev->iflink = dev->ifindex; ++ if (dev->rebuild_header == NULL) ++ dev->rebuild_header = default_rebuild_header; ++ dev_init_scheduler(dev); ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ } ++ } ++ ++ /* ++ * Unhook devices that failed to come up ++ */ ++ dp = &dev_base; ++ while ((dev = *dp) != NULL) { ++ if (dev->deadbeaf) { ++ write_lock_bh(&dev_base_lock); ++ *dp = dev->next; ++ write_unlock_bh(&dev_base_lock); ++ dev_put(dev); ++ } else { ++ dp = &dev->next; ++ } ++ } ++ ++#ifdef CONFIG_PROC_FS ++ proc_net_create("dev", 0, dev_get_info); ++ create_proc_read_entry("net/softnet_stat", 0, 0, dev_proc_stats, NULL); ++ proc_net_drivers = proc_mkdir("net/drivers", 0); ++#ifdef WIRELESS_EXT ++ /* Available in net/core/wireless.c */ ++ proc_net_create("wireless", 0, dev_get_wireless_info); ++#endif /* WIRELESS_EXT */ ++#endif /* CONFIG_PROC_FS */ ++ ++ dev_boot_phase = 0; ++ ++ open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); ++ open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); ++ ++ dst_init(); ++ dev_mcast_init(); ++ ++#ifdef CONFIG_NET_SCHED ++ pktsched_init(); ++#endif ++ /* ++ * Initialise network devices ++ */ ++ ++ net_device_init(); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG ++ ++/* Notify userspace when a netdevice event occurs, ++ * by running '/sbin/hotplug net' with certain ++ * environment variables set. 
++ */ ++ ++static int net_run_sbin_hotplug(struct net_device *dev, char *action) ++{ ++ char *argv[3], *envp[5], ifname[12 + IFNAMSIZ], action_str[32]; ++ int i; ++ ++ sprintf(ifname, "INTERFACE=%s", dev->name); ++ sprintf(action_str, "ACTION=%s", action); ++ ++ i = 0; ++ argv[i++] = hotplug_path; ++ argv[i++] = "net"; ++ argv[i] = 0; ++ ++ i = 0; ++ /* minimal command environment */ ++ envp [i++] = "HOME=/"; ++ envp [i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; ++ envp [i++] = ifname; ++ envp [i++] = action_str; ++ envp [i] = 0; ++ ++ return call_usermodehelper(argv [0], argv, envp); ++} ++#endif +diff --unified --recursive --new-file linux-2.4.30/net/netsyms.c linux-2.4.30-1-686-smp-ring3/net/netsyms.c +--- linux-2.4.30/net/netsyms.c 2005-04-04 03:42:20.000000000 +0200 ++++ linux-2.4.30-1-686-smp-ring3/net/netsyms.c 2005-10-22 23:08:28.016050500 +0200 +@@ -628,3 +628,18 @@ + #endif /* CONFIG_NET_RADIO || CONFIG_NET_PCMCIA_RADIO */ + + #endif /* CONFIG_NET */ ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++#include ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++#include ++ ++EXPORT_SYMBOL(get_skb_ring_handler); ++EXPORT_SYMBOL(set_skb_ring_handler); ++EXPORT_SYMBOL(do_skb_ring_handler); ++EXPORT_SYMBOL(get_buffer_ring_handler); ++EXPORT_SYMBOL(set_buffer_ring_handler); ++EXPORT_SYMBOL(do_buffer_ring_handler); ++#endif ++ ++#endif +diff --unified --recursive --new-file linux-2.4.30/net/ring/Config.in linux-2.4.30-1-686-smp-ring3/net/ring/Config.in +--- linux-2.4.30/net/ring/Config.in 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/ring/Config.in 2005-10-22 23:08:28.048052500 +0200 +@@ -0,0 +1,4 @@ ++# ++# PF_RING ++# ++tristate ' PF_RING (EXPERIMENTAL)' CONFIG_RING +diff --unified --recursive --new-file linux-2.4.30/net/ring/Makefile linux-2.4.30-1-686-smp-ring3/net/ring/Makefile +--- linux-2.4.30/net/ring/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.4.30-1-686-smp-ring3/net/ring/Makefile 2005-10-22 
/*
 * Compatibility shim, compiled only for kernels >= 2.6.11.
 *
 * Provides a remap_page_range() taking a physical address so that the
 * rest of this file can keep calling it; the call is forwarded to
 * remap_pfn_range() with the address converted to a page frame number.
 *
 * vma:    user VMA being populated
 * uvaddr: user virtual address where the mapping starts
 * paddr:  physical address of the memory to map
 * size:   length of the mapping in bytes
 * prot:   page protection for the mapping
 *
 * Returns the result of remap_pfn_range().
 */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11))
static inline int remap_page_range(struct vm_area_struct *vma,
				   unsigned long uvaddr,
				   unsigned long paddr,
				   unsigned long size,
				   pgprot_t prot) {
  /* physical address -> page frame number */
  return(remap_pfn_range(vma, uvaddr, paddr >> PAGE_SHIFT,
			 size, prot));
}
#endif
/*
 * Per-socket state of a PF_RING socket.
 *
 * One instance is allocated (and zeroed) by ring_create() and attached
 * to the socket through ring_sk(); the ring memory itself is set up
 * later by packet_ring_bind().
 */
struct ring_opt {
  /* Device the ring is bound to. packet_ring_bind() assigns it as its
     very last step, so a non-NULL value means the ring is usable. */
  struct net_device *ring_netdev;

  /* Cluster */
  u_short cluster_id; /* 0 = no cluster */

  /* Reflector: when set, add_skb_to_ring() transmits the packet on
     this device instead of copying it into the ring */
  struct net_device *reflector_dev;

  /* Packet buffers: page order passed to __get_free_pages()/free_pages() */
  unsigned long order;

  /* Ring Slots */
  unsigned long ring_memory; /* Address returned by __get_free_pages(), 0 = unbound */
  FlowSlotInfo *slots_info; /* Basically it points to ring_memory */
  char *ring_slots;  /* Basically it points to ring_memory
			+sizeof(FlowSlotInfo) */

  /* Packet Sampling: pktToSample is a countdown reloaded from
     sample_rate; sampling is active only when sample_rate > 1 */
  u_int pktToSample, sample_rate;

  /* BPF Filter: packets for which the filter returns 0 are not stored */
  struct sk_filter *bpfFilter;

  /* Locks */
  atomic_t num_ring_slots_waiters;
  wait_queue_head_t ring_slots_waitqueue; /* Woken after each insert attempt */
  rwlock_t ring_index_lock; /* Taken around updates of the ring counters/indexes */

  /* Indexes (Internal) */
  u_int insert_page_id, insert_slot_id;
};
/*
 * Socket destructor, installed by ring_create() as sk_destruct
 * (2.6) / destruct (2.4).
 *
 * Frees the per-socket ring_opt attached through ring_sk() and, on 2.4
 * kernels, drops the module use count. If the socket is not yet marked
 * dead the function returns early and frees nothing.
 *
 * NOTE: the two kernel branches are not symmetric. On 2.6 the receive
 * queue is purged and the dead check runs before the BUG_TRAP
 * assertions; on 2.4 the assertions run first.
 */
static void ring_sock_destruct(struct sock *sk) {

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
  /* Drop anything still queued for reception */
  skb_queue_purge(&sk->sk_receive_queue);

  if (!sock_flag(sk, SOCK_DEAD)) {
#if defined(RING_DEBUG)
    printk("Attempt to release alive ring socket: %p\n", sk);
#endif
    return;
  }

  /* No receive/send memory may still be accounted to the socket */
  BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
  BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
#else

  /* No receive/send memory may still be accounted to the socket */
  BUG_TRAP(atomic_read(&sk->rmem_alloc)==0);
  BUG_TRAP(atomic_read(&sk->wmem_alloc)==0);

  if (!sk->dead) {
#if defined(RING_DEBUG)
    printk("Attempt to release alive ring socket: %p\n", sk);
#endif
    return;
  }
#endif

  /* ring_release() may already have freed this and stored NULL here */
  kfree(ring_sk(sk));

#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
  /* Balances the MOD_INC_USE_COUNT done in ring_create() */
  MOD_DEC_USE_COUNT;
#endif
}
++ */ ++static inline void ring_insert(struct sock *sk) { ++ struct ring_element *next; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_insert()\n"); ++#endif ++ ++ next = kmalloc(sizeof(struct ring_element), GFP_ATOMIC); ++ if(next != NULL) { ++ next->sk = sk; ++ write_lock_irq(&ring_mgmt_lock); ++ list_add(&next->list, &ring_table); ++ write_unlock_irq(&ring_mgmt_lock); ++ } else { ++ if (net_ratelimit()) ++ printk("RING: could not kmalloc slot!!\n"); ++ } ++} ++ ++/* ********************************** */ ++/* ++ * ring_remove() ++ * ++ * For each of the elements in the list: ++ * - check if this is the element we want to delete ++ * - if it is, remove it from the list, and free it. ++ * ++ * stop when we find the one we're looking for (break), ++ * or when we reach the end of the list. ++ */ ++static inline void ring_remove(struct sock *sk) { ++ struct list_head *ptr; ++ struct ring_element *entry; ++ ++ ++ for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) { ++ entry = list_entry(ptr, struct ring_element, list); ++ ++ if(entry->sk == sk) { ++ write_lock_irq(&ring_mgmt_lock); ++ list_del(ptr); ++ kfree(ptr); ++ write_unlock_irq(&ring_mgmt_lock); ++ break; ++ } ++ } ++ ++} ++ ++/* ********************************** */ ++ ++static u_int32_t num_queued_pkts(struct ring_opt *pfr) { ++ ++ if(pfr->ring_slots != NULL) { ++ ++ u_int32_t tot_insert = pfr->slots_info->insert_idx, ++#if defined(RING_DEBUG) ++ tot_read = pfr->slots_info->tot_read, tot_pkts; ++#else ++ tot_read = pfr->slots_info->tot_read; ++#endif ++ ++ if(tot_insert >= tot_read) { ++#if defined(RING_DEBUG) ++ tot_pkts = tot_insert-tot_read; ++#endif ++ return(tot_insert-tot_read); ++ } else { ++#if defined(RING_DEBUG) ++ tot_pkts = ((u_int32_t)-1)+tot_insert-tot_read; ++#endif ++ return(((u_int32_t)-1)+tot_insert-tot_read); ++ } ++ ++#if defined(RING_DEBUG) ++ printk("-> num_queued_pkts=%d [tot_insert=%d][tot_read=%d]\n", ++ tot_pkts, tot_insert, tot_read); ++#endif ++ ++ } else ++ 
return(0); ++} ++ ++/* ********************************** */ ++ ++static inline FlowSlot* get_insert_slot(struct ring_opt *pfr) { ++#if defined(RING_DEBUG) ++ printk("get_insert_slot(%d)\n", pfr->slots_info->insert_idx); ++#endif ++ ++ if(pfr->ring_slots != NULL) { ++ FlowSlot *slot = (FlowSlot*)&(pfr->ring_slots[pfr->slots_info->insert_idx ++ *pfr->slots_info->slot_len]); ++ return(slot); ++ } else ++ return(NULL); ++} ++ ++/* ********************************** */ ++ ++static inline FlowSlot* get_remove_slot(struct ring_opt *pfr) { ++#if defined(RING_DEBUG) ++ printk("get_remove_slot(%d)\n", pfr->slots_info->remove_idx); ++#endif ++ ++ if(pfr->ring_slots != NULL) ++ return((FlowSlot*)&(pfr->ring_slots[pfr->slots_info->remove_idx* ++ pfr->slots_info->slot_len])); ++ else ++ return(NULL); ++} ++ ++/* ********************************** */ ++ ++static void add_skb_to_ring(struct sk_buff *skb, ++ struct ring_opt *pfr, ++ u_char recv_packet, ++ u_char real_skb /* 1=skb 0=faked skb */) { ++ FlowSlot *theSlot; ++ int idx, displ; ++ ++ if(recv_packet) { ++ /* Hack for identifying a packet received by the e1000 */ ++ if(real_skb) { ++ displ = SKB_DISPLACEMENT; ++ } else ++ displ = 0; /* Received by the e1000 wrapper */ ++ } else ++ displ = 0; ++ ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->tot_pkts++; ++ write_unlock(&pfr->ring_index_lock); ++ ++ /* BPF Filtering (from af_packet.c) */ ++ if(pfr->bpfFilter != NULL) { ++ unsigned res = 1, len; ++ ++ len = skb->len-skb->data_len; ++ ++ write_lock(&pfr->ring_index_lock); ++ skb->data -= displ; ++ res = sk_run_filter(skb, pfr->bpfFilter->insns, pfr->bpfFilter->len); ++ skb->data += displ; ++ write_unlock(&pfr->ring_index_lock); ++ ++ if(res == 0) { ++ /* Filter failed */ ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb): Filter failed [len=%d][tot=%llu]" ++ "[insertIdx=%d][pkt_type=%d][cloned=%d]\n", ++ (int)skb->len, pfr->slots_info->tot_pkts, ++ pfr->slots_info->insert_idx, ++ skb->pkt_type, 
skb->cloned); ++#endif ++ ++ return; ++ } ++ } ++ ++ /* ************************** */ ++ ++ if(pfr->sample_rate > 1) { ++ if(pfr->pktToSample == 0) { ++ write_lock(&pfr->ring_index_lock); ++ pfr->pktToSample = pfr->sample_rate; ++ write_unlock(&pfr->ring_index_lock); ++ } else { ++ write_lock(&pfr->ring_index_lock); ++ pfr->pktToSample--; ++ write_unlock(&pfr->ring_index_lock); ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb): sampled packet [len=%d]" ++ "[tot=%llu][insertIdx=%d][pkt_type=%d][cloned=%d]\n", ++ (int)skb->len, pfr->slots_info->tot_pkts, ++ pfr->slots_info->insert_idx, ++ skb->pkt_type, skb->cloned); ++#endif ++ return; ++ } ++ } ++ ++ /* ************************************* */ ++ ++ if((pfr->reflector_dev != NULL) ++ && (!netif_queue_stopped(pfr->reflector_dev))) { ++ int cpu = smp_processor_id(); ++ ++ /* increase reference counter so that this skb is not freed */ ++ atomic_inc(&skb->users); ++ ++ skb->data -= displ; ++ ++ /* send it */ ++ if (pfr->reflector_dev->xmit_lock_owner != cpu) { ++ spin_lock_bh(&pfr->reflector_dev->xmit_lock); ++ pfr->reflector_dev->xmit_lock_owner = cpu; ++ spin_unlock_bh(&pfr->reflector_dev->xmit_lock); ++ ++ if (pfr->reflector_dev->hard_start_xmit(skb, ++ pfr->reflector_dev) == 0) { ++ spin_lock_bh(&pfr->reflector_dev->xmit_lock); ++ pfr->reflector_dev->xmit_lock_owner = -1; ++ skb->data += displ; ++ spin_unlock_bh(&pfr->reflector_dev->xmit_lock); ++#if defined(RING_DEBUG) ++ printk("++ hard_start_xmit succeeded\n"); ++#endif ++ return; /* OK */ ++ } ++ ++ spin_lock_bh(&pfr->reflector_dev->xmit_lock); ++ pfr->reflector_dev->xmit_lock_owner = -1; ++ spin_unlock_bh(&pfr->reflector_dev->xmit_lock); ++ } ++ ++#if defined(RING_DEBUG) ++ printk("++ hard_start_xmit failed\n"); ++#endif ++ skb->data += displ; ++ return; /* -ENETDOWN */ ++ } ++ ++ /* ************************************* */ ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb) [len=%d][tot=%llu][insertIdx=%d]" ++ "[pkt_type=%d][cloned=%d]\n", 
++ (int)skb->len, pfr->slots_info->tot_pkts, ++ pfr->slots_info->insert_idx, ++ skb->pkt_type, skb->cloned); ++#endif ++ ++ idx = pfr->slots_info->insert_idx; ++ theSlot = get_insert_slot(pfr); ++ ++ if((theSlot != NULL) && (theSlot->slot_state == 0)) { ++ struct pcap_pkthdr *hdr; ++ unsigned int bucketSpace; ++ char *bucket; ++ ++ /* Update Index */ ++ idx++; ++ ++ if(idx == pfr->slots_info->tot_slots) { ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->insert_idx = 0; ++ write_unlock(&pfr->ring_index_lock); ++ } else { ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->insert_idx = idx; ++ write_unlock(&pfr->ring_index_lock); ++ } ++ ++ bucketSpace = pfr->slots_info->slot_len ++#ifdef RING_MAGIC ++ - sizeof(u_char) ++#endif ++ - sizeof(u_char) /* flowSlot.slot_state */ ++ - sizeof(struct pcap_pkthdr) ++ - 1 /* 10 */ /* safe boundary */; ++ ++ bucket = &theSlot->bucket; ++ hdr = (struct pcap_pkthdr*)bucket; ++ ++ if(skb->stamp.tv_sec == 0) do_gettimeofday(&skb->stamp); ++ ++ hdr->ts.tv_sec = skb->stamp.tv_sec, hdr->ts.tv_usec = skb->stamp.tv_usec; ++ hdr->caplen = skb->len+displ; ++ ++ if(hdr->caplen > bucketSpace) ++ hdr->caplen = bucketSpace; ++ ++ hdr->len = skb->len+displ; ++ memcpy(&bucket[sizeof(struct pcap_pkthdr)], ++ skb->data-displ, hdr->caplen); ++ ++#if defined(RING_DEBUG) ++ { ++ static unsigned int lastLoss = 0; ++ ++ if(pfr->slots_info->tot_lost ++ && (lastLoss != pfr->slots_info->tot_lost)) { ++ printk("add_skb_to_ring(%d): [bucketSpace=%d]" ++ "[hdr.caplen=%d][skb->len=%d]" ++ "[pcap_pkthdr=%d][removeIdx=%d]" ++ "[loss=%lu][page=%u][slot=%u]\n", ++ idx-1, bucketSpace, hdr->caplen, skb->len, ++ sizeof(struct pcap_pkthdr), ++ pfr->slots_info->remove_idx, ++ (long unsigned int)pfr->slots_info->tot_lost, ++ pfr->insert_page_id, pfr->insert_slot_id); ++ ++ lastLoss = pfr->slots_info->tot_lost; ++ } ++ } ++#endif ++ ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->tot_insert++; ++ theSlot->slot_state = 1; ++ 
write_unlock(&pfr->ring_index_lock); ++ } else { ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->tot_lost++; ++ write_unlock(&pfr->ring_index_lock); ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb): packet lost [loss=%lu]" ++ "[removeIdx=%u][insertIdx=%u]\n", ++ (long unsigned int)pfr->slots_info->tot_lost, ++ pfr->slots_info->remove_idx, pfr->slots_info->insert_idx); ++#endif ++ } ++ ++ /* wakeup in case of poll() */ ++ if(waitqueue_active(&pfr->ring_slots_waitqueue)) ++ wake_up_interruptible(&pfr->ring_slots_waitqueue); ++} ++ ++/* ********************************** */ ++ ++static u_int hash_skb(struct ring_cluster *cluster_ptr, ++ struct sk_buff *skb, u_char recv_packet) { ++ u_int idx; ++ int displ; ++ struct iphdr *ip; ++ ++ if(cluster_ptr->hashing_mode == cluster_round_robin) { ++ idx = cluster_ptr->hashing_id++; ++ } else { ++ /* Per-flow clustering */ ++ if(skb->len > sizeof(struct iphdr)+sizeof(struct tcphdr)) { ++ if(recv_packet) ++ displ = 0; ++ else ++ displ = SKB_DISPLACEMENT; ++ ++ /* ++ skb->data+displ ++ ++ Always points to to the IP part of the packet ++ */ ++ ++ ip = (struct iphdr*)(skb->data+displ); ++ ++ idx = ip->saddr+ip->daddr+ip->protocol; ++ ++ if(ip->protocol == IPPROTO_TCP) { ++ struct tcphdr *tcp = (struct tcphdr*)(skb->data+displ ++ +sizeof(struct iphdr)); ++ idx += tcp->source+tcp->dest; ++ } else if(ip->protocol == IPPROTO_UDP) { ++ struct udphdr *udp = (struct udphdr*)(skb->data+displ ++ +sizeof(struct iphdr)); ++ idx += udp->source+udp->dest; ++ } ++ } else ++ idx = skb->len; ++ } ++ ++ return(idx % cluster_ptr->num_cluster_elements); ++} ++ ++/* ********************************** */ ++ ++static int skb_ring_handler(struct sk_buff *skb, ++ u_char recv_packet, ++ u_char real_skb /* 1=skb 0=faked skb */) { ++ struct sock *skElement; ++ int rc = 0; ++ struct list_head *ptr; ++ struct ring_cluster *cluster_ptr; ++ ++#ifdef PROFILING ++ uint64_t rdt = _rdtsc(), rdt1, rdt2; ++#endif ++ ++ if((!skb) /* Invalid skb */ 
++ || ((!enable_tx_capture) && (!recv_packet))) { ++ /* ++ An outgoing packet is about to be sent out ++ but we decided not to handle transmitted ++ packets. ++ */ ++ return(0); ++ } ++ ++#if defined(RING_DEBUG) ++ if(0) { ++ printk("skb_ring_handler() [len=%d][dev=%s]\n", skb->len, ++ skb->dev->name == NULL ? "" : skb->dev->name); ++ } ++#endif ++ ++#ifdef PROFILING ++ rdt1 = _rdtsc(); ++#endif ++ ++ /* [1] Check unclustered sockets */ ++ for (ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) { ++ struct ring_opt *pfr; ++ struct ring_element *entry; ++ ++ entry = list_entry(ptr, struct ring_element, list); ++ ++ read_lock(&ring_mgmt_lock); ++ skElement = entry->sk; ++ pfr = ring_sk(skElement); ++ read_unlock(&ring_mgmt_lock); ++ ++ if((pfr != NULL) ++ && (pfr->cluster_id == 0 /* No cluster */) ++ && (pfr->ring_slots != NULL) ++ && (pfr->ring_netdev == skb->dev)) { ++ /* We've found the ring where the packet can be stored */ ++ read_lock(&ring_mgmt_lock); ++ add_skb_to_ring(skb, pfr, recv_packet, real_skb); ++ read_unlock(&ring_mgmt_lock); ++ ++ rc = 1; /* Ring found: we've done our job */ ++ } ++ } ++ ++ /* [2] Check socket clusters */ ++ cluster_ptr = ring_cluster_list; ++ ++ while(cluster_ptr != NULL) { ++ struct ring_opt *pfr; ++ ++ if(cluster_ptr->num_cluster_elements > 0) { ++ u_int skb_hash = hash_skb(cluster_ptr, skb, recv_packet); ++ ++ read_lock(&ring_mgmt_lock); ++ skElement = cluster_ptr->sk[skb_hash]; ++ read_unlock(&ring_mgmt_lock); ++ ++ if(skElement != NULL) { ++ pfr = ring_sk(skElement); ++ ++ if((pfr != NULL) ++ && (pfr->ring_slots != NULL) ++ && (pfr->ring_netdev == skb->dev)) { ++ /* We've found the ring where the packet can be stored */ ++ read_lock(&ring_mgmt_lock); ++ add_skb_to_ring(skb, pfr, recv_packet, real_skb); ++ read_unlock(&ring_mgmt_lock); ++ ++ rc = 1; /* Ring found: we've done our job */ ++ } ++ } ++ } ++ ++ cluster_ptr = cluster_ptr->next; ++ } ++ ++#ifdef PROFILING ++ rdt1 = _rdtsc()-rdt1; ++#endif ++ ++#ifdef 
PROFILING ++ rdt2 = _rdtsc(); ++#endif ++ ++ if(transparent_mode) rc = 0; ++ ++ if((rc != 0) && real_skb) ++ dev_kfree_skb(skb); /* Free the skb */ ++ ++#ifdef PROFILING ++ rdt2 = _rdtsc()-rdt2; ++ rdt = _rdtsc()-rdt; ++ ++#if defined(RING_DEBUG) ++ printk("# cycles: %d [lock costed %d %d%%][free costed %d %d%%]\n", ++ (int)rdt, rdt-rdt1, ++ (int)((float)((rdt-rdt1)*100)/(float)rdt), ++ rdt2, ++ (int)((float)(rdt2*100)/(float)rdt)); ++#endif ++#endif ++ ++ return(rc); /* 0 = packet not handled */ ++} ++ ++/* ********************************** */ ++ ++struct sk_buff skb; ++ ++static int buffer_ring_handler(struct net_device *dev, ++ char *data, int len) { ++ ++#if defined(RING_DEBUG) ++ printk("buffer_ring_handler: [dev=%s][len=%d]\n", ++ dev->name == NULL ? "" : dev->name, len); ++#endif ++ ++ skb.dev = dev, skb.len = len, skb.data = data, ++ skb.data_len = len, skb.stamp.tv_sec = 0; /* Calculate the time */ ++ ++ skb_ring_handler(&skb, 1, 0 /* fake skb */); ++ ++ return(0); ++} ++ ++/* ********************************** */ ++ ++static int ring_create(struct socket *sock, int protocol) { ++ struct sock *sk; ++ struct ring_opt *pfr; ++ int err; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_create()\n"); ++#endif ++ ++ /* Are you root, superuser or so ? */ ++ if(!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ if(sock->type != SOCK_RAW) ++ return -ESOCKTNOSUPPORT; ++ ++ if(protocol != htons(ETH_P_ALL)) ++ return -EPROTONOSUPPORT; ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++ MOD_INC_USE_COUNT; ++#endif ++ ++ err = -ENOMEM; ++ ++ // BD: -- broke this out to keep it more simple and clear as to what the ++ // options are. 
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11)) ++ sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL); ++#endif ++#endif ++ ++ // BD: API changed in 2.6.12, ref: ++ // http://svn.clkao.org/svnweb/linux/revision/?rev=28201 ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11)) ++ sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1); ++#endif ++ ++ if (sk == NULL) ++ goto out; ++ ++ sock->ops = &ring_ops; ++ sock_init_data(sock, sk); ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11)) ++ sk_set_owner(sk, THIS_MODULE); ++#endif ++#endif ++ ++ err = -ENOMEM; ++ ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL)); ++ ++ if (!(pfr = ring_sk(sk))) { ++ sk_free(sk); ++ goto out; ++ } ++ memset(pfr, 0, sizeof(*pfr)); ++ init_waitqueue_head(&pfr->ring_slots_waitqueue); ++ pfr->ring_index_lock = RW_LOCK_UNLOCKED; ++ atomic_set(&pfr->num_ring_slots_waiters, 0); ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ sk->sk_family = PF_RING; ++ sk->sk_destruct = ring_sock_destruct; ++#else ++ sk->family = PF_RING; ++ sk->destruct = ring_sock_destruct; ++ sk->num = protocol; ++#endif ++ ++ ring_insert(sk); ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_create() - created\n"); ++#endif ++ ++ return(0); ++ out: ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++ MOD_DEC_USE_COUNT; ++#endif ++ return err; ++} ++ ++/* *********************************************** */ ++ ++static int ring_release(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ struct ring_opt *pfr = ring_sk(sk); ++ ++ if(!sk) ++ return 0; ++ ++#if defined(RING_DEBUG) ++ printk("RING: called ring_release\n"); ++#endif ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_release entered\n"); ++#endif ++ ++ ring_remove(sk); ++ ++ sock_orphan(sk); ++ sock->sk = NULL; ++ ++ /* Free the ring buffer */ ++ if(pfr->ring_memory) { ++ struct page *page, *page_end; ++ ++ page_end = 
virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1); ++ for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++) ++ ClearPageReserved(page); ++ ++ free_pages(pfr->ring_memory, pfr->order); ++ } ++ ++ kfree(pfr); ++ ring_sk(sk) = NULL; ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ skb_queue_purge(&sk->sk_write_queue); ++#endif ++ sock_put(sk); ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_release leaving\n"); ++#endif ++ ++ return 0; ++} ++ ++/* ********************************** */ ++/* ++ * We create a ring for this socket and bind it to the specified device ++ */ ++static int packet_ring_bind(struct sock *sk, struct net_device *dev) ++{ ++ u_int the_slot_len; ++ u_int32_t tot_mem; ++ struct ring_opt *pfr = ring_sk(sk); ++ struct page *page, *page_end; ++ ++ if(!dev) return(-1); ++ ++#if defined(RING_DEBUG) ++ printk("RING: packet_ring_bind(%s) called\n", dev->name); ++#endif ++ ++ /* ********************************************** ++ ++ ************************************* ++ * * ++ * FlowSlotInfo * ++ * * ++ ************************************* <-+ ++ * FlowSlot * | ++ ************************************* | ++ * FlowSlot * | ++ ************************************* +- num_slots ++ * FlowSlot * | ++ ************************************* | ++ * FlowSlot * | ++ ************************************* <-+ ++ ++ ********************************************** */ ++ ++ the_slot_len = sizeof(u_char) /* flowSlot.slot_state */ ++ + sizeof(u_short) /* flowSlot.slot_len */ ++ + bucket_len /* flowSlot.bucket */; ++ ++ tot_mem = sizeof(FlowSlotInfo) + num_slots*the_slot_len; ++ ++ /* ++ Calculate the value of the order parameter used later. ++ See http://www.linuxjournal.com/article.php?sid=1133 ++ */ ++ for(pfr->order = 0;(PAGE_SIZE << pfr->order) < tot_mem; pfr->order++) ; ++ ++ /* ++ We now try to allocate the memory as required. If we fail ++ we try to allocate a smaller amount or memory (hence a ++ smaller ring). 
++ */ ++ while((pfr->ring_memory = __get_free_pages(GFP_ATOMIC, pfr->order)) == 0) ++ if(pfr->order-- == 0) ++ break; ++ ++ if(pfr->order == 0) { ++#if defined(RING_DEBUG) ++ printk("ERROR: not enough memory\n"); ++#endif ++ return(-1); ++ } else { ++#if defined(RING_DEBUG) ++ printk("RING: succesfully allocated %lu KB [tot_mem=%d][order=%ld]\n", ++ PAGE_SIZE >> (10 - pfr->order), tot_mem, pfr->order); ++#endif ++ } ++ ++ tot_mem = PAGE_SIZE << pfr->order; ++ memset((char*)pfr->ring_memory, 0, tot_mem); ++ ++ /* Now we need to reserve the pages */ ++ page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1); ++ for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++) ++ SetPageReserved(page); ++ ++ pfr->slots_info = (FlowSlotInfo*)pfr->ring_memory; ++ pfr->ring_slots = (char*)(pfr->ring_memory+sizeof(FlowSlotInfo)); ++ ++ pfr->slots_info->version = RING_FLOWSLOT_VERSION; ++ pfr->slots_info->slot_len = the_slot_len; ++ pfr->slots_info->tot_slots = (tot_mem-sizeof(FlowSlotInfo))/the_slot_len; ++ pfr->slots_info->tot_mem = tot_mem; ++ pfr->slots_info->sample_rate = sample_rate; ++ ++#if defined(RING_DEBUG) ++ printk("RING: allocated %d slots [slot_len=%d][tot_mem=%u]\n", ++ pfr->slots_info->tot_slots, pfr->slots_info->slot_len, ++ pfr->slots_info->tot_mem); ++#endif ++ ++#ifdef RING_MAGIC ++ { ++ int i; ++ ++ for(i=0; islots_info->tot_slots; i++) { ++ unsigned long idx = i*pfr->slots_info->slot_len; ++ FlowSlot *slot = (FlowSlot*)&pfr->ring_slots[idx]; ++ slot->magic = RING_MAGIC_VALUE; slot->slot_state = 0; ++ } ++ } ++#endif ++ ++ pfr->insert_page_id = 1, pfr->insert_slot_id = 0; ++ ++ /* ++ IMPORTANT ++ Leave this statement here as last one. In fact when ++ the ring_netdev != NULL the socket is ready to be used. 
++ */ ++ pfr->ring_netdev = dev; ++ ++ return(0); ++} ++ ++/* ************************************* */ ++ ++/* Bind to a device */ ++static int ring_bind(struct socket *sock, ++ struct sockaddr *sa, int addr_len) ++{ ++ struct sock *sk=sock->sk; ++ struct net_device *dev = NULL; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_bind() called\n"); ++#endif ++ ++ /* ++ * Check legality ++ */ ++ if (addr_len != sizeof(struct sockaddr)) ++ return -EINVAL; ++ if (sa->sa_family != PF_RING) ++ return -EINVAL; ++ ++ /* Safety check: add trailing zero if missing */ ++ sa->sa_data[sizeof(sa->sa_data)-1] = '\0'; ++ ++#if defined(RING_DEBUG) ++ printk("RING: searching device %s\n", sa->sa_data); ++#endif ++ ++ if((dev = __dev_get_by_name(sa->sa_data)) == NULL) { ++#if defined(RING_DEBUG) ++ printk("RING: search failed\n"); ++#endif ++ return(-EINVAL); ++ } else ++ return(packet_ring_bind(sk, dev)); ++} ++ ++/* ************************************* */ ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ ++volatile void* virt_to_kseg(volatile void* address) { ++ pte_t *pte; ++ pud_t *pud; ++ unsigned long addr = (unsigned long)address; ++ ++ pud = pud_offset(pgd_offset_k((unsigned long) address), ++ (unsigned long) address); ++ ++ /* ++ High-memory support courtesy of ++ Brad Doctor ++ */ ++#if defined(CONFIG_X86_PAE) && (!defined(CONFIG_NOHIGHMEM)) ++ pte = pte_offset_map(pmd_offset(pud, addr), addr); ++#else ++ pte = pmd_offset_map(pud, addr); ++#endif ++ ++ return((volatile void*)pte_page(*pte)); ++} ++ ++#else /* 2.4 */ ++ ++/* http://www.scs.ch/~frey/linux/memorymap.html */ ++volatile void *virt_to_kseg(volatile void *address) ++{ ++ pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; ++ unsigned long va, ret = 0UL; ++ ++ va=VMALLOC_VMADDR((unsigned long)address); ++ ++ /* get the page directory. Use the kernel memory map. 
*/ ++ pgd = pgd_offset_k(va); ++ ++ /* check whether we found an entry */ ++ if (!pgd_none(*pgd)) ++ { ++ /* get the page middle directory */ ++ pmd = pmd_offset(pgd, va); ++ /* check whether we found an entry */ ++ if (!pmd_none(*pmd)) ++ { ++ /* get a pointer to the page table entry */ ++ ptep = pte_offset(pmd, va); ++ pte = *ptep; ++ /* check for a valid page */ ++ if (pte_present(pte)) ++ { ++ /* get the address the page is refering to */ ++ ret = (unsigned long)page_address(pte_page(pte)); ++ /* add the offset within the page to the page address */ ++ ret |= (va & (PAGE_SIZE -1)); ++ } ++ } ++ } ++ return((volatile void *)ret); ++} ++#endif ++ ++/* ************************************* */ ++ ++static int ring_mmap(struct file *file, ++ struct socket *sock, ++ struct vm_area_struct *vma) ++{ ++ struct sock *sk = sock->sk; ++ struct ring_opt *pfr = ring_sk(sk); ++ unsigned long size, start; ++ u_int pagesToMap; ++ char *ptr; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() called\n"); ++#endif ++ ++ if(pfr->ring_memory == 0) { ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() failed: mapping area to an unbound socket\n"); ++#endif ++ return -EINVAL; ++ } ++ ++ size = (unsigned long)(vma->vm_end-vma->vm_start); ++ ++ if(size % PAGE_SIZE) { ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() failed: len is not multiple of PAGE_SIZE\n"); ++#endif ++ return(-EINVAL); ++ } ++ ++ /* if userspace tries to mmap beyond end of our buffer, fail */ ++ if(size > pfr->slots_info->tot_mem) { ++#if defined(RING_DEBUG) ++ printk("proc_mmap() failed: area too large [%ld > %d]\n", size, pfr->slots_info->tot_mem); ++#endif ++ return(-EINVAL); ++ } ++ ++ pagesToMap = size/PAGE_SIZE; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() called. 
%d pages to map\n", pagesToMap);
++#endif
++
++#if defined(RING_DEBUG)
++  printk("RING: mmap [slot_len=%d][tot_slots=%d] for ring on device %s\n",
++         pfr->slots_info->slot_len, pfr->slots_info->tot_slots,
++         pfr->ring_netdev->name);
++#endif
++
++  /* we do not want to have this area swapped out, lock it */
++  vma->vm_flags |= VM_LOCKED;
++  start = vma->vm_start;
++
++  /* Ring slots start from page 1 (page 0 is reserved for FlowSlotInfo) */
++  ptr = (char*)(start+PAGE_SIZE);
++
++  if(remap_page_range(
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++                      vma,
++#endif
++                      start,
++                      __pa(pfr->ring_memory),
++                      PAGE_SIZE*pagesToMap, vma->vm_page_prot)) {
++#if defined(RING_DEBUG)
++    printk("remap_page_range() failed\n");
++#endif
++    return(-EAGAIN);
++  }
++
++#if defined(RING_DEBUG)
++  printk("proc_mmap(pagesToMap=%d): success.\n", pagesToMap);
++#endif
++
++  return 0;
++}
++
++/* ************************************* */
++/*
++ * ring_recvmsg - recvmsg handler for ring sockets. Two prototypes are
++ * provided, selected by kernel version.
++ *
++ * No packet data is copied into msg; msg, len and flags are not referenced
++ * in the body. The loop runs until num_queued_pkts(pfr) reaches
++ * MIN_QUEUED_PKTS, or until more than MAX_QUEUE_LOOPS iterations have been
++ * counted while at least one packet was queued. The value returned is the
++ * last queued-packet count.
++ *
++ * NOTE(review): the condition handed to wait_event_interruptible() is the
++ * constant 1, so the call looks like it returns at once instead of
++ * sleeping. With an empty ring num_loops is never incremented and the loop
++ * has no other exit. Confirm against the wait-queue macro.
++ * NOTE(review): the RING_DEBUG printk inside the loop reads slot->slot_state
++ * without the slot != NULL guard that is used after the loop.
++ */
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++static int ring_recvmsg(struct kiocb *iocb, struct socket *sock,
++                        struct msghdr *msg, size_t len, int flags)
++#else
++  static int ring_recvmsg(struct socket *sock, struct msghdr *msg, int len,
++                          int flags, struct scm_cookie *scm)
++#endif
++{
++  FlowSlot* slot;
++  struct ring_opt *pfr = ring_sk(sock->sk);
++  u_int32_t queued_pkts, num_loops = 0;
++
++#if defined(RING_DEBUG)
++  printk("ring_recvmsg called\n");
++#endif
++
++  slot = get_remove_slot(pfr);
++
++  while((queued_pkts = num_queued_pkts(pfr)) < MIN_QUEUED_PKTS) {
++    wait_event_interruptible(pfr->ring_slots_waitqueue, 1);
++
++#if defined(RING_DEBUG)
++    printk("-> ring_recvmsg returning %d [queued_pkts=%d][num_loops=%d]\n",
++           slot->slot_state, queued_pkts, num_loops);
++#endif
++
++    /* The loop budget is only consumed while something is queued */
++    if(queued_pkts > 0) {
++      if(num_loops++ > MAX_QUEUE_LOOPS)
++        break;
++    }
++  }
++
++#if defined(RING_DEBUG)
++  if(slot != NULL)
++    printk("ring_recvmsg is returning [queued_pkts=%d][num_loops=%d]\n",
++           queued_pkts, num_loops);
++#endif
++
++  return(queued_pkts);
++}
++
++/* ************************************* */
++/*
++ * ring_poll - poll handler for ring sockets.
++ *
++ * Looks at the slot returned by get_remove_slot(pfr). If that slot exists
++ * and its slot_state is 0, the caller is registered on ring_slots_waitqueue
++ * through poll_wait(). If it exists and its slot_state is 1, the result is
++ * POLLIN | POLLRDNORM; in every other case the result is 0.
++ *
++ * NOTE(review): the RING_DEBUG printk dereferences slot unconditionally even
++ * though both tests around it allow for slot == NULL.
++ */
++unsigned int ring_poll(struct file * file,
++                       struct socket *sock, poll_table *wait)
++{
++  FlowSlot* slot;
++  struct ring_opt *pfr = ring_sk(sock->sk);
++
++#if defined(RING_DEBUG)
++  printk("poll called\n");
++#endif
++
++  slot = get_remove_slot(pfr);
++
++  if((slot != NULL) && (slot->slot_state == 0))
++    poll_wait(file, &pfr->ring_slots_waitqueue, wait);
++
++#if defined(RING_DEBUG)
++  printk("poll returning %d\n", slot->slot_state);
++#endif
++
++  if((slot != NULL) && (slot->slot_state == 1))
++    return(POLLIN | POLLRDNORM);
++  else
++    return(0);
++}
++
++/* ************************************* */
++/*
++ * add_to_cluster_list - append sock to the cluster el.
++ *
++ * Returns -1 when el already holds CLUSTER_LEN sockets and 0 otherwise. On
++ * success the cluster id of el is also stored through
++ * ring_sk_datatype(ring_sk(sock)).
++ */
++int add_to_cluster_list(struct ring_cluster *el,
++                        struct sock *sock) {
++
++  if(el->num_cluster_elements == CLUSTER_LEN)
++    return(-1); /* Cluster full */
++
++  ring_sk_datatype(ring_sk(sock))->cluster_id = el->cluster_id;
++  el->sk[el->num_cluster_elements] = sock;
++  el->num_cluster_elements++;
++  return(0);
++}
++
++/* ************************************* */
++/*
++ * remove_from_cluster_list - drop sock from the cluster el.
++ *
++ * Returns 0 when sock was found in el->sk and -1 when it was not.
++ *
++ * NOTE(review): both for-loop headers below are damaged in this copy. The
++ * text that should sit between the loop index and "sk[...]" is missing
++ * (presumably the bound, the increment and an "el->" prefix). They are
++ * left exactly as found; restore them from the PF_RING sources before
++ * using this hunk.
++ */
++int remove_from_cluster_list(struct ring_cluster *el,
++                             struct sock *sock) {
++  int i, j;
++
++  for(i=0; isk[i] == sock) {
++      el->num_cluster_elements--;
++
++      if(el->num_cluster_elements > 0) {
++        /* The cluster contains other elements */
++        for(j=i; jsk[j] = el->sk[j+1];
++
++        el->sk[CLUSTER_LEN-1] = NULL;
++      } else {
++        /* Empty cluster */
++        memset(el->sk, 0, sizeof(el->sk));
++      }
++
++      return(0);
++    }
++
++  return(-1); /* Not found */
++}
++
++/* ************************************* */
++/*
++ * remove_from_cluster - detach sock from the cluster named by
++ * pfr->cluster_id.
++ *
++ * Returns 0 when pfr is in no cluster (id 0), the result of
++ * remove_from_cluster_list() when a cluster with that id is on
++ * ring_cluster_list, and -EINVAL when no such cluster is listed.
++ *
++ * NOTE(review): nothing visible here resets pfr->cluster_id afterwards.
++ */
++static int remove_from_cluster(struct sock *sock,
++                               struct ring_opt *pfr)
++{
++  struct ring_cluster *el;
++
++#if defined(RING_DEBUG)
++  printk("--> remove_from_cluster(%d)\n", pfr->cluster_id);
++#endif
++
++  if(pfr->cluster_id == 0 /* 0 = No Cluster */)
++    return(0); /* Nothing to do */
++
++  el = ring_cluster_list;
++
++  while(el != NULL) {
++    if(el->cluster_id == pfr->cluster_id) {
++      return(remove_from_cluster_list(el, sock));
++    } else
++      el = el->next;
++  }
++
++  return(-EINVAL); /* Not found */
++}
++
++/* ************************************* */
++/*
++ * add_to_cluster - put sock into cluster cluster_id, creating the cluster
++ * when it does not exist yet.
++ *
++ * Id 0 is rejected with -EINVAL. A socket whose pfr already carries a
++ * cluster id is first passed to remove_from_cluster(). If a cluster with
++ * the requested id is on ring_cluster_list, the result of
++ * add_to_cluster_list() is returned. Otherwise a new ring_cluster is
++ * allocated (-ENOMEM on failure), given sock as its first member, pushed
++ * on the head of ring_cluster_list, and 0 is returned.
++ *
++ * NOTE(review): the printk below is wrapped in #ifndef RING_DEBUG while the
++ * other traces in this part of the file use #if defined(RING_DEBUG), so
++ * it prints in non-debug builds. Looks inverted - confirm.
++ * NOTE(review): the SO_ADD_TO_CLUSTER case of ring_setsockopt() calls this
++ * between write_lock(&ring_mgmt_lock) and write_unlock(), and the
++ * allocation uses GFP_KERNEL. Check whether sleeping is allowed there.
++ */
++static int add_to_cluster(struct sock *sock,
++                          struct ring_opt *pfr,
++                          u_short cluster_id)
++{
++  struct ring_cluster *el;
++
++#ifndef RING_DEBUG
++  printk("--> add_to_cluster(%d)\n", cluster_id);
++#endif
++
++  if(cluster_id == 0 /* 0 = No Cluster */) return(-EINVAL);
++
++  if(pfr->cluster_id != 0)
++    remove_from_cluster(sock, pfr);
++
++  el = ring_cluster_list;
++
++  while(el != NULL) {
++    if(el->cluster_id == cluster_id) {
++      return(add_to_cluster_list(el, sock));
++    } else
++      el = el->next;
++  }
++
++  /* There's no existing cluster. We need to create one */
++  if((el = kmalloc(sizeof(struct ring_cluster), GFP_KERNEL)) == NULL)
++    return(-ENOMEM);
++
++  el->cluster_id = cluster_id;
++  el->num_cluster_elements = 1;
++  el->hashing_mode = cluster_per_flow; /* Default */
++  el->hashing_id = 0;
++
++  memset(el->sk, 0, sizeof(el->sk));
++  el->sk[0] = sock;
++  el->next = ring_cluster_list;
++  ring_cluster_list = el;
++  pfr->cluster_id = cluster_id;
++
++  return(0); /* 0 = OK */
++}
++
++/* ************************************* */
++/*
++ * ring_setsockopt - socket options for ring sockets. Options that the
++ * switch does not recognise (found == 0) are passed on to
++ * sock_setsockopt().
++ *
++ * NOTE(review): this copy has lost the text between "if((optlen" and
++ * "insns, fprog.filter, fsize))": the argument checks, the start of the
++ * switch and most of what is presumably the filter-attach case. The
++ * notes below cover only what is still visible.
++ * NOTE(review): in the surviving filter-attach tail, the first "break" does
++ * not free filter, and pfr->bpfFilter is overwritten without freeing a
++ * previous value. Whether either leaks depends on the lost text.
++ * NOTE(review): SO_DETACH_FILTER takes ring_mgmt_lock but releases it only
++ * when a filter was installed; the other path breaks out with the lock
++ * still held. -ENONET may also be a typo for -ENOENT - confirm.
++ * NOTE(review): SO_ADD_TO_CLUSTER validates optlen against sizeof(val) but
++ * copies sizeof(cluster_id) bytes into a u_int, which is then passed to
++ * add_to_cluster() as a u_short.
++ * NOTE(review): SO_SET_REFLECTOR overwrites pfr->reflector_dev without
++ * looking at the previous value. Check whether dev_get_by_name() hands
++ * back a counted reference that needs a matching release.
++ */
++/* Code taken/inspired from core/sock.c */
++static int ring_setsockopt(struct socket *sock,
++                           int level, int optname,
++                           char *optval, int optlen)
++{
++  struct ring_opt *pfr = ring_sk(sock->sk);
++  int val, found, ret = 0;
++  u_int cluster_id;
++  char devName[8];
++
++  if((optleninsns, fprog.filter, fsize))
++          break;
++
++        filter->len = fprog.len;
++
++        if(sk_chk_filter(filter->insns, filter->len) != 0) {
++          /* Bad filter specified */
++          kfree(filter);
++          pfr->bpfFilter = NULL;
++          break;
++        }
++
++        /* get the lock, set the filter, release the lock */
++        write_lock(&ring_mgmt_lock);
++        pfr->bpfFilter = filter;
++        write_unlock(&ring_mgmt_lock);
++      }
++      ret = 0;
++      break;
++
++    case SO_DETACH_FILTER:
++      write_lock(&ring_mgmt_lock);
++      found = 1;
++      if(pfr->bpfFilter != NULL) {
++        kfree(pfr->bpfFilter);
++        pfr->bpfFilter = NULL;
++        write_unlock(&ring_mgmt_lock);
++        break;
++      }
++      ret = -ENONET; /* NOTE(review): reached with ring_mgmt_lock still held */
++      break;
++
++    case SO_ADD_TO_CLUSTER:
++      if (optlen!=sizeof(val))
++        return -EINVAL;
++
++      if (copy_from_user(&cluster_id, optval, sizeof(cluster_id)))
++        return -EFAULT;
++
++      write_lock(&ring_mgmt_lock);
++      ret = add_to_cluster(sock->sk, pfr, cluster_id);
++      write_unlock(&ring_mgmt_lock);
++      break;
++
++    case SO_REMOVE_FROM_CLUSTER:
++      write_lock(&ring_mgmt_lock);
++      ret = remove_from_cluster(sock->sk, pfr);
++      write_unlock(&ring_mgmt_lock);
++      break;
++
++    case SO_SET_REFLECTOR:
++      /* The name plus its terminator must fit in devName */
++      if(optlen >= (sizeof(devName)-1))
++        return -EINVAL;
++
++      if(optlen > 0) {
++        if(copy_from_user(devName, optval, optlen))
++          return -EFAULT;
++      }
++
++      devName[optlen] = '\0';
++
++#if defined(RING_DEBUG)
++      printk("+++ SO_SET_REFLECTOR(%s)\n", devName);
++#endif
++
++      write_lock(&ring_mgmt_lock);
++      pfr->reflector_dev = dev_get_by_name(devName);
++      write_unlock(&ring_mgmt_lock);
++
++#if defined(RING_DEBUG)
++      if(pfr->reflector_dev != NULL)
++        printk("SO_SET_REFLECTOR(%s): succeded\n", devName);
++      else
++        printk("SO_SET_REFLECTOR(%s): device unknown\n", devName);
++#endif
++      break;
++
++    default:
++      found = 0;
++      break;
++    }
++
++  if(found)
++    return(ret);
++  else
++    return(sock_setsockopt(sock, level, optname, optval, optlen));
++}
++
++/* ************************************* */
++/*
++ * ring_ioctl - forwards the interface ioctls listed below to dev_ioctl() and
++ * answers -EOPNOTSUPP for every other command. SIORINGPOLL, which ring.h
++ * defines, is not among the listed cases.
++ */
++static int ring_ioctl(struct socket *sock,
++                      unsigned int cmd, unsigned long arg)
++{
++  switch(cmd)
++    {
++    case SIOCGIFFLAGS:
++    case SIOCSIFFLAGS:
++    case SIOCGIFCONF:
++    case SIOCGIFMETRIC:
++    case SIOCSIFMETRIC:
++    case SIOCGIFMEM:
++    case SIOCSIFMEM:
++    case SIOCGIFMTU:
++    case SIOCSIFMTU:
++    case SIOCSIFLINK:
++    case SIOCGIFHWADDR:
++    case SIOCSIFHWADDR:
++    case SIOCSIFMAP:
++    case SIOCGIFMAP:
++    case SIOCSIFSLAVE:
++    case SIOCGIFSLAVE:
++    case SIOCGIFINDEX:
++    case SIOCGIFNAME:
++    case SIOCGIFCOUNT:
++    case SIOCSIFHWBROADCAST:
++      return(dev_ioctl(cmd,(void *) arg));
++
++    default:
++      return -EOPNOTSUPP;
++    }
++
++  return 0; /* not reached: every switch arm returns */
++}
++
++/* ************************************* */
++/* Socket operation table for ring sockets; ring_* entries are this file's handlers. */
++static struct proto_ops ring_ops = {
++  .family = PF_RING,
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++  .owner = THIS_MODULE,
++#endif
++
++  /* Operations that make no sense on ring sockets. */
++  .connect = sock_no_connect,
++  .socketpair = sock_no_socketpair,
++  .accept = sock_no_accept,
++  .getname = sock_no_getname,
++  .listen = sock_no_listen,
++  .shutdown = sock_no_shutdown,
++  .sendpage = sock_no_sendpage,
++  .sendmsg = sock_no_sendmsg,
++  .getsockopt = sock_no_getsockopt,
++
++  /* Now the operations that really occur. */
++  .release = ring_release,
++  .bind = ring_bind,
++  .mmap = ring_mmap,
++  .poll = ring_poll,
++  .setsockopt = ring_setsockopt,
++  .ioctl = ring_ioctl,
++  .recvmsg = ring_recvmsg,
++};
++
++/* ************************************ */
++/* Protocol-family descriptor: family number plus the socket create hook. */
++static struct net_proto_family ring_family_ops = {
++  .family = PF_RING,
++  .create = ring_create,
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
++  .owner = THIS_MODULE,
++#endif
++};
++
++// BD: API changed in 2.6.12, ref:
++// http://svn.clkao.org/svnweb/linux/revision/?rev=28201
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
++static struct proto ring_proto = {
++  .name = "PF_RING",
++  .owner = THIS_MODULE,
++  .obj_size = sizeof(struct sock),
++};
++#endif
++
++/* ************************************ */
++/*
++ * ring_exit - module unload. Frees every ring_element reachable from
++ * ring_table and every ring_cluster on ring_cluster_list, clears both ring
++ * handlers and unregisters the PF_RING family.
++ *
++ * NOTE(review): the first loop advances with ptr->next after kfree(entry).
++ * list_entry() derives entry from ptr via the member "list", so ptr
++ * points into the object that was just freed. The cluster loop below
++ * saves ->next before freeing; this loop should do the same.
++ */
++static void __exit ring_exit(void)
++{
++  struct list_head *ptr;
++  struct ring_element *entry;
++
++  for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
++    entry = list_entry(ptr, struct ring_element, list);
++    kfree(entry);
++  }
++
++  while(ring_cluster_list != NULL) {
++    struct ring_cluster *next = ring_cluster_list->next;
++    kfree(ring_cluster_list);
++    ring_cluster_list = next;
++  }
++
++  set_skb_ring_handler(NULL);
++  set_buffer_ring_handler(NULL);
++  sock_unregister(PF_RING);
++
++  printk("PF_RING shut down.\n");
++}
++
++/* 
************************************ */ ++ ++static int __init ring_init(void) ++{ ++ printk("Welcome to PF_RING %s\n(C) 2004 L.Deri \n", ++ RING_VERSION); ++ ++ INIT_LIST_HEAD(&ring_table); ++ ring_cluster_list = NULL; ++ ++ sock_register(&ring_family_ops); ++ ++ set_skb_ring_handler(skb_ring_handler); ++ set_buffer_ring_handler(buffer_ring_handler); ++ ++ if(get_buffer_ring_handler() != buffer_ring_handler) { ++ printk("PF_RING: set_buffer_ring_handler FAILED\n"); ++ ++ set_skb_ring_handler(NULL); ++ set_buffer_ring_handler(NULL); ++ sock_unregister(PF_RING); ++ return -1; ++ } else { ++ printk("PF_RING: bucket length %d bytes\n", bucket_len); ++ printk("PF_RING: ring slots %d\n", num_slots); ++ printk("PF_RING: sample rate %d [1=no sampling]\n", sample_rate); ++ printk("PF_RING: capture TX %s\n", ++ enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]"); ++ printk("PF_RING: transparent mode %s\n", ++ transparent_mode ? "Yes" : "No"); ++ ++ printk("PF_RING initialized correctly.\n"); ++ return 0; ++ } ++} ++ ++module_init(ring_init); ++module_exit(ring_exit); ++MODULE_LICENSE("GPL"); ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++MODULE_ALIAS_NETPROTO(PF_RING); ++#endif -- cgit v1.2.3