[PATCH] Inbound Connection Control mechanism: Prioritized Accept Queue

From: Sridhar Samudrala (samudrala@us.ibm.com)
Date: Fri Jul 27 2001 - 12:10:22 EST


The following patch provides a mechanism called Prioritized Accept Queues (PAQ)
to prioritize incoming connection requests on a socket based on the source/dest
IP addresses and ports.

For example, this feature can be used to guarantee low delay and high throughput
to preferred clients on a web server by assigning higher priority to connection
requests whose source ip address matches the ip address of the preferred clients.
It can also be used on a server hosting multiple websites each identified by its
own ip address. In this case the prioritization can be done based on the
destination ip address of the connection requests.

Documentation on how to use this patch, and test results that show an
improvement in connection rate for higher-priority classes, can be found at our
project website:
        http://oss.software.ibm.com/qos

We would appreciate any comments or suggestions.

Thanks
Sridhar

---------------------------
Sridhar Samudrala
IBM Linux Technology Centre
samudrala@us.ibm.com

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
diff -urN -X dontdiff linux-2.4.6/Documentation/Configure.help linux-2.4.6-paq/Documentation/Configure.help
--- linux-2.4.6/Documentation/Configure.help Mon Jul 2 14:07:55 2001
+++ linux-2.4.6-paq/Documentation/Configure.help Thu Jul 5 16:34:05 2001
@@ -1955,6 +1955,14 @@
   If you want to compile it as a module, say M here and read
   Documentation/modules.txt. If unsure, say `N'.
 
+Prioritized Accept Queue (EXPERIMENTAL)
+CONFIG_PRIO_ACCEPTQ
+ When enabled, this option allows you to assign priorities to incoming
+ connection requests using rules created with the iptables MARK target
+ option. The nfmark field set by the rules is used as the priority value
+ when the connection is added to the accept queue. The priority value
+ ranges from 0 to 7, with 0 being the highest priority and 7 the lowest.
+
 Packet filtering
 CONFIG_IP_NF_FILTER
   Packet filtering defines a table `filter', which has a series of
diff -urN -X dontdiff linux-2.4.6/include/net/sock.h linux-2.4.6-paq/include/net/sock.h
--- linux-2.4.6/include/net/sock.h Tue Jul 3 15:44:12 2001
+++ linux-2.4.6-paq/include/net/sock.h Thu Jul 5 16:45:31 2001
@@ -239,6 +239,11 @@
 #define pppoe_relay proto.pppoe.relay
 #endif
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+/* Largest valid accept-queue priority (not a count); priorities range from 0-7 */
+#define MAX_ACCEPTQ_PRIO 7
+#endif
+
 /* This defines a selective acknowledgement block. */
 struct tcp_sack_block {
         __u32 start_seq;
@@ -409,7 +414,11 @@
 
         /* FIFO of established children */
         struct open_request *accept_queue;
+#ifdef CONFIG_PRIO_ACCEPTQ
+ struct open_request *accept_queue_tail[MAX_ACCEPTQ_PRIO + 1]; /* one tail per priority 0..MAX_ACCEPTQ_PRIO */
+#else
         struct open_request *accept_queue_tail;
+#endif
 
         int write_pending; /* A write to socket waits to start. */
 
diff -urN -X dontdiff linux-2.4.6/include/net/tcp.h linux-2.4.6-paq/include/net/tcp.h
--- linux-2.4.6/include/net/tcp.h Tue Jul 3 15:44:20 2001
+++ linux-2.4.6-paq/include/net/tcp.h Thu Jul 5 16:49:18 2001
@@ -519,6 +519,9 @@
                 struct tcp_v6_open_req v6_req;
 #endif
         } af;
+#ifdef CONFIG_PRIO_ACCEPTQ
+ int acceptq_prio;
+#endif
 };
 
 /* SLAB cache for open requests. */
@@ -1566,10 +1569,33 @@
                                          struct sock *child)
 {
         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+#ifdef CONFIG_PRIO_ACCEPTQ
+ int prio = req->acceptq_prio;
+ int prev_prio;
+#endif
 
         req->sk = child;
         tcp_acceptq_added(sk);
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+ if (!tp->accept_queue_tail[prio]) {
+ for (prev_prio = prio - 1; prev_prio >= 0; prev_prio--)
+ if (tp->accept_queue_tail[prev_prio])
+ break;
+ tp->accept_queue_tail[prio] = req;
+ if (prev_prio >= 0) {
+ req->dl_next = tp->accept_queue_tail[prev_prio]->dl_next;
+ tp->accept_queue_tail[prev_prio]->dl_next = req;
+ } else {
+ req->dl_next = tp->accept_queue;
+ tp->accept_queue = req;
+ }
+ } else {
+ req->dl_next = tp->accept_queue_tail[prio]->dl_next;
+ tp->accept_queue_tail[prio]->dl_next = req;
+ tp->accept_queue_tail[prio] = req;
+ }
+#else
         if (!tp->accept_queue_tail) {
                 tp->accept_queue = req;
         } else {
@@ -1577,6 +1603,7 @@
         }
         tp->accept_queue_tail = req;
         req->dl_next = NULL;
+#endif
 }
 
 struct tcp_listen_opt
@@ -1643,6 +1670,10 @@
                                         struct tcp_opt *tp,
                                         struct sk_buff *skb)
 {
+#ifdef CONFIG_PRIO_ACCEPTQ
+ int nfmark = (int)skb->nfmark;
+#endif
+
         req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
         req->rcv_isn = TCP_SKB_CB(skb)->seq;
         req->mss = tp->mss_clamp;
@@ -1654,6 +1685,9 @@
         req->acked = 0;
         req->ecn_ok = 0;
         req->rmt_port = skb->h.th->source;
+#ifdef CONFIG_PRIO_ACCEPTQ
+ req->acceptq_prio = (nfmark < 0) ? 0 : ((nfmark > MAX_ACCEPTQ_PRIO) ? MAX_ACCEPTQ_PRIO : nfmark);
+#endif
 }
 
 #define TCP_MEM_QUANTUM ((int)PAGE_SIZE)
diff -urN -X dontdiff linux-2.4.6/net/ipv4/netfilter/Config.in linux-2.4.6-paq/net/ipv4/netfilter/Config.in
--- linux-2.4.6/net/ipv4/netfilter/Config.in Tue Mar 6 22:44:16 2001
+++ linux-2.4.6-paq/net/ipv4/netfilter/Config.in Thu Jul 5 16:34:05 2001
@@ -27,6 +27,7 @@
   if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
     dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_UNCLEAN $CONFIG_IP_NF_IPTABLES
     dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_OWNER $CONFIG_IP_NF_IPTABLES
+ bool ' Prioritized Accept Queues (EXPERIMENTAL)' CONFIG_PRIO_ACCEPTQ
   fi
 # The targets
   dep_tristate ' Packet filtering' CONFIG_IP_NF_FILTER $CONFIG_IP_NF_IPTABLES
diff -urN -X dontdiff linux-2.4.6/net/ipv4/tcp.c linux-2.4.6-paq/net/ipv4/tcp.c
--- linux-2.4.6/net/ipv4/tcp.c Wed May 16 10:31:27 2001
+++ linux-2.4.6-paq/net/ipv4/tcp.c Thu Jul 5 16:34:05 2001
@@ -529,7 +529,12 @@
 
         sk->max_ack_backlog = 0;
         sk->ack_backlog = 0;
+#ifdef CONFIG_PRIO_ACCEPTQ
+ tp->accept_queue = NULL;
+ memset(tp->accept_queue_tail, 0, (sizeof(struct open_request *) * (MAX_ACCEPTQ_PRIO + 1)));
+#else
         tp->accept_queue = tp->accept_queue_tail = NULL;
+#endif
         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
         tcp_delack_init(tp);
 
@@ -588,7 +593,12 @@
         write_lock_bh(&tp->syn_wait_lock);
         tp->listen_opt =NULL;
         write_unlock_bh(&tp->syn_wait_lock);
+#ifdef CONFIG_PRIO_ACCEPTQ
+ tp->accept_queue = NULL;
+ memset(tp->accept_queue_tail, 0, (sizeof(struct open_request *) * (MAX_ACCEPTQ_PRIO + 1)));
+#else
         tp->accept_queue = tp->accept_queue_tail = NULL;
+#endif
 
         if (lopt->qlen) {
                 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
@@ -2109,6 +2119,9 @@
         struct open_request *req;
         struct sock *newsk;
         int error;
+#ifdef CONFIG_PRIO_ACCEPTQ
+ int prio;
+#endif
 
         lock_sock(sk);
 
@@ -2134,8 +2147,17 @@
         }
 
         req = tp->accept_queue;
+#ifdef CONFIG_PRIO_ACCEPTQ
+ tp->accept_queue = req->dl_next;
+ for (prio = 0; prio <= MAX_ACCEPTQ_PRIO; prio++)
+ if (req == tp->accept_queue_tail[prio]) {
+ tp->accept_queue_tail[prio] = NULL;
+ break;
+ }
+#else
         if ((tp->accept_queue = req->dl_next) == NULL)
                 tp->accept_queue_tail = NULL;
+#endif
 
          newsk = req->sk;
         tcp_acceptq_removed(sk);
diff -urN -X dontdiff linux-2.4.6/net/ipv4/tcp_minisocks.c linux-2.4.6-paq/net/ipv4/tcp_minisocks.c
--- linux-2.4.6/net/ipv4/tcp_minisocks.c Thu Apr 12 12:11:39 2001
+++ linux-2.4.6-paq/net/ipv4/tcp_minisocks.c Thu Jul 5 16:34:05 2001
@@ -733,7 +733,12 @@
                 newtp->num_sacks = 0;
                 newtp->urg_data = 0;
                 newtp->listen_opt = NULL;
+#ifdef CONFIG_PRIO_ACCEPTQ
+ newtp->accept_queue = NULL;
+ memset(newtp->accept_queue_tail, 0, (sizeof(struct open_request *) * (MAX_ACCEPTQ_PRIO + 1)));
+#else
                 newtp->accept_queue = newtp->accept_queue_tail = NULL;
+#endif
                 /* Deinitialize syn_wait_lock to trap illegal accesses. */
                 memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Tue Jul 31 2001 - 21:00:33 EST