diff -urN ns-2.31-orig/common/scheduler.cc ns-2.31-linux/common/scheduler.cc --- ns-2.31-orig/common/scheduler.cc 2006-02-21 07:20:18.000000000 -0800 +++ ns-2.31-linux/common/scheduler.cc 2007-07-18 15:35:42.000000000 -0700 @@ -34,6 +34,16 @@ * @(#) $Header: /cvsroot/nsnam/ns-2/common/scheduler.cc,v 1.74 2006/02/21 15:20:18 mahrenho Exp $ */ +/* Sep 2005: David Wei (weixl@caltech.edu): + * 1. change the Calendar Queue to be SNOOPy Calendar Queue to improve simulation speed + * See related paper for SNOOPy queue: + * Kah Leong Tan, Li-Jin Thng, "SNOOPy Calendar Queue", + * Proceedings of the 32nd conference on Winter simulation + * Orlando, Florida, Pages: 487 - 495, Year of Publication: 2000, ISBN:1-23456-789-0 + * 2. use average intervals of dequeued events instead of average interval of events in the most crowded + * bucket as the estimation of bucket width + */ + #ifndef lint static const char rcsid[] = "@(#) $Header: /cvsroot/nsnam/ns-2/common/scheduler.cc,v 1.74 2006/02/21 15:20:18 mahrenho Exp $ (LBL)"; @@ -581,6 +591,14 @@ } class_calendar_sched; CalendarScheduler::CalendarScheduler() : cal_clock_(clock_) { +#ifdef CALENDAR_NEW_WIDTH + avg_gap_ = -2; //weixl for improving width calculation + last_time_ = -2; //weixl for improving width calculation + gap_num_ = 0; + head_search_ = 0; + insert_search_ = 0; + round_num_ = 0; +#endif reinit(4, 1.0, cal_clock_); } @@ -595,47 +613,46 @@ CalendarScheduler::insert(Event* e) { int i; - if (cal_clock_ > e->time_) { + double newtime = e->time_; + if (cal_clock_ > newtime) { // may happen in RT scheduler - cal_clock_ = e->time_; + cal_clock_ = newtime; i = lastbucket_ = CALENDAR_HASH(cal_clock_); } else - i = CALENDAR_HASH(e->time_); + i = CALENDAR_HASH(newtime); - Event *head = buckets_[i].list_; - Event *before=0; + Bucket* current=(&buckets_[i]); + Event *head = current->list_; + Event *after=0; if (!head) { - buckets_[i].list_ = e; + current->list_ = e; e->next_ = e->prev_ = e; ++stat_qsize_; - ++buckets_[i].count_; 
+ ++(current->count_); } else { - bool newhead; - if (e->time_ >= head->prev_->time_) { - // insert at the tail - before = head; - newhead = false; + insert_search_++; + if (newtime < head->time_) { + // e-> head -> ... + e->next_ = head; + e->prev_ = head->prev_; + e->prev_->next_ = e; + head->prev_ = e; + current->list_ = e; + ++stat_qsize_; + ++(current->count_); } else { - // insert event in time sorted order, FIFO for sim-time events - for (before = head; e->time_ >= before->time_; before = before->next_) - ; - newhead = (before == head); - } - - e->next_ = before; - e->prev_ = before->prev_; - before->prev_ = e; - e->prev_->next_ = e; - if (newhead) { - buckets_[i].list_ = e; - //assert(e->time_ <= e->next_->time_); - } - //assert(e->prev_ != e); - if (e->prev_->time_ != e->time_) { - // unique timing - ++stat_qsize_; - ++buckets_[i].count_; + for (after = head->prev_; newtime < after->time_; after = after->prev_) { insert_search_++; }; + //...-> after -> e -> ... + e->next_ = after->next_; + e->prev_ = after; + e->next_->prev_ = e; + after->next_ = e; + if (after->time_ < newtime) { + //unique timing + ++stat_qsize_; + ++(current->count_); + } } } ++qsize_; @@ -710,6 +727,7 @@ Event *e, *min_e = NULL; #define CAL_DEQUEUE(x) \ do { \ + head_search_++; \ if ((e = buckets_[i].list_) != NULL) { \ diff = e->time_ - cal_clock_; \ if (diff < diff##x##_) { \ @@ -768,6 +786,35 @@ if (!e) return 0; +//weixl for improving width calculation +#ifdef CALENDAR_NEW_WIDTH + if (last_time_< 0) last_time_ = e->time_; + else + { + gap_num_ ++; + if (gap_num_ >= qsize_ ) { + double tt_gap_ = e->time_ - last_time_; + avg_gap_ = tt_gap_ / gap_num_; + gap_num_ = 0; + last_time_ = e->time_; + round_num_ ++; + if ((round_num_ > 20) && + (( head_search_> (insert_search_<<1)) + ||( insert_search_> (head_search_<<1)) )) + { + resize(nbuckets_, cal_clock_); + round_num_ = 0; + } else { + if (round_num_ > 100) { + round_num_ = 0; + head_search_ = 0; + insert_search_ = 0; + } + } + } + }; 
+#endif +//weixl done int l = lastbucket_; @@ -828,10 +875,19 @@ void CalendarScheduler::resize(int newsize, double start) { - double bwidth = newwidth(newsize); - - if (newsize < 4) - newsize = 4; + double bwidth; + if (newsize == nbuckets_) { + /* we resize for bwidth*/ + if (head_search_) bwidth = head_search_; else bwidth = 1; + if (insert_search_) bwidth = bwidth / insert_search_; + bwidth = sqrt (bwidth) * width_; + //snoopy queue calculation + } else { + /* we resize for size */ + bwidth = newwidth(newsize); + if (newsize < 4) + newsize = 4; + } Bucket *oldb = buckets_; int oldn = nbuckets_; @@ -856,13 +912,21 @@ } while (e != tail); } } - delete [] oldb; + head_search_ = 0; + insert_search_ = 0; + round_num_ = 0; + delete [] oldb; } // take samples from the most populated bucket. double CalendarScheduler::newwidth(int newsize) { +//by weixl for better width calculation +#ifdef CALENDAR_NEW_WIDTH + if (avg_gap_ > 0) return avg_gap_*4.0; +#endif +//weixl done int i; int max_bucket = 0; // index of the fullest bucket for (i = 1; i < nbuckets_; ++i) { diff -urN ns-2.31-orig/common/scheduler.h ns-2.31-linux/common/scheduler.h --- ns-2.31-orig/common/scheduler.h 2005-07-26 18:13:42.000000000 -0700 +++ ns-2.31-linux/common/scheduler.h 2007-07-18 15:35:42.000000000 -0700 @@ -34,6 +34,12 @@ * @(#) $Header: /cvsroot/nsnam/ns-2/common/scheduler.h,v 1.27 2005/07/27 01:13:42 tomh Exp $ (LBL) */ +/* Sep 2005: David Wei (weixl@caltech.edu): change on Calendar Queue. 
See .cc file for details */ + +#ifndef CALENDAR_NEW_WIDTH +#define CALENDAR_NEW_WIDTH 1 +#endif + #ifndef ns_scheduler_h #define ns_scheduler_h @@ -154,6 +160,15 @@ const Event* head(); protected: +#ifdef CALENDAR_NEW_WIDTH +/* weixl for improving width calculation */ + long unsigned head_search_; + long unsigned insert_search_; + int round_num_; + long int gap_num_; //the number of gap samples in this window (in process of calculation) + double last_time_; //the departure time of first event in this window + double avg_gap_; //the average gap in last window (finished calculation) +#endif double width_; double diff0_, diff1_, diff2_; /* wrap-around checks */ diff -urN ns-2.31-orig/Makefile.in ns-2.31-linux/Makefile.in --- ns-2.31-orig/Makefile.in 2007-03-10 15:49:57.000000000 -0800 +++ ns-2.31-linux/Makefile.in 2007-07-18 15:43:36.000000000 -0700 @@ -188,6 +188,12 @@ tcp/tcp-vegas.o tcp/tcp-rbp.o tcp/tcp-full.o tcp/rq.o \ baytcp/tcp-full-bay.o baytcp/ftpc.o baytcp/ftps.o \ tcp/scoreboard.o tcp/scoreboard-rq.o tcp/tcp-sack1.o tcp/tcp-fack.o \ + tcp/scoreboard1.o tcp/tcp-linux.o tcp/linux/ns-linux-util.o \ + tcp/linux/ns-linux-c.o tcp/linux/tcp_cong.o tcp/linux/tcp_naivereno.o \ + tcp/linux/tcp_highspeed.o tcp/linux/tcp_bic.o tcp/linux/tcp_htcp.o \ + tcp/linux/tcp_scalable.o tcp/linux/tcp_westwood.o \ + tcp/linux/tcp_vegas.o tcp/linux/tcp_hybla.o tcp/linux/tcp_cubic.o \ + tcp/linux/tcp_veno.o tcp/linux/tcp_compound.o tcp/linux/tcp_lp.o \ tcp/tcp-asym.o tcp/tcp-asym-sink.o tcp/tcp-fs.o \ tcp/tcp-asym-fs.o \ tcp/tcp-int.o tcp/chost.o tcp/tcp-session.o \ diff -urN ns-2.31-orig/tcl/lib/ns-default.tcl ns-2.31-linux/tcl/lib/ns-default.tcl --- ns-2.31-orig/tcl/lib/ns-default.tcl 2006-10-22 22:33:16.000000000 -0700 +++ ns-2.31-linux/tcl/lib/ns-default.tcl 2007-07-18 15:35:46.000000000 -0700 @@ -1360,3 +1360,12 @@ # Quick Start definitions end here Delayer set debug_ false + +Agent/TCP/Linux set rtxcur_init_ 3 +Agent/TCP/Linux set maxrto_ 120 +Agent/TCP/Linux set minrto_ 0.2
+Agent/TCP/Linux set ts_resetRTO_ true +Agent/TCP/Linux set next_pkts_in_flight_ 0 +Agent/TCP/Linux set delay_growth_ false + + diff -urN ns-2.31-orig/tcp/linux/ns-linux-c.c ns-2.31-linux/tcp/linux/ns-linux-c.c --- ns-2.31-orig/tcp/linux/ns-linux-c.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/ns-linux-c.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,57 @@ +/* + * TCP-Linux module for NS2 + * + * May 2006 + * + * Author: Xiaoliang (David) Wei (DavidWei@acm.org) + * + * NetLab, the California Institute of Technology + * http://netlab.caltech.edu + * + * Module: linux/ns-linux-c.c + * This is the utilities of shortcuts for Linux source codes (in C) + * We shortcut most of the Linux system calls which are not related to congestion control. + * + * See a mini-tutorial about TCP-Linux at: http://www.cs.caltech.edu/~weixl/ns2.html + * + */ + +#include "ns-linux-util.h" +#include "ns-linux-c.h" +int fls(int x) +{ + int r = 32; + + if (!x) + return 0; + if (!(x & 0xffff0000u)) { + x <<= 16; + r -= 16; + } + if (!(x & 0xff000000u)) { + x <<= 8; + r -= 8; + } + if (!(x & 0xf0000000u)) { + x <<= 4; + r -= 4; + } + if (!(x & 0xc0000000u)) { + x <<= 2; + r -= 2; + } + if (!(x & 0x80000000u)) { + x <<= 1; + r -= 1; + } + return r; +} + +int fls64(__u64 x) +{ + __u32 h = x >> 32; + if (h) + return fls(h) + 32; + return fls(x); +} + diff -urN ns-2.31-orig/tcp/linux/ns-linux-c.h ns-2.31-linux/tcp/linux/ns-linux-c.h --- ns-2.31-orig/tcp/linux/ns-linux-c.h 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/ns-linux-c.h 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,124 @@ +/* + * TCP-Linux module for NS2 + * + * May 2006 + * + * Author: Xiaoliang (David) Wei (DavidWei@acm.org) + * + * NetLab, the California Institute of Technology + * http://netlab.caltech.edu + * + * Module: linux/ns-linux-c.h + * This is the header file of shortcuts for Linux source codes (in C) + * We shortcut most of the Linux system calls which are not related to congestion 
control. + * + * See a mini-tutorial about TCP-Linux at: http://www.cs.caltech.edu/~weixl/ns2.html + * + */ + +#ifndef NS_LINUX_C_H +#define NS_LINUX_C_H + +#include "ns-linux-util.h" +//For sharing Reno + +extern u32 tcp_reno_ssthresh(struct sock *sk); +extern void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag); + +#define tcp_is_cwnd_limited(sk, in_flight) (in_flight >= sk->snd_cwnd) +//from kernel.h + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define min_t(type,x,y) \ + (((type)x) < ((type)y) ? ((type)x): ((type)y)) + +#define max_t(type,x,y) \ + (((type)x) > ((type)y) ? ((type)x): ((type)y)) + +/* NOTE(review): min/max/after/between below were reconstructed after angle-bracketed spans were lost in extraction; further definitions that originally sat between after() and between() may still be missing -- confirm against the upstream ns-linux-c.h. */ +#define min(x,y) ((x<y)?x:y) +#define max(x,y) ((x>y)?x:y) +#define after(seq1,seq2) (seq2<seq1) +#define between(seq1,seq2,seq3) (seq3 - seq2 >= seq1 - seq2) + +////////////For delay based protocols: +struct tcpvegas_info { + __u32 tcpv_enabled; + __u32 tcpv_rttcnt; + __u32 tcpv_rtt; + __u32 tcpv_minrtt; +}; +#define INET_DIAG_VEGASINFO 1 +struct rtattr {}; +#define __RTA_PUT(skb, INFO_FLAG, size) NULL +#define RTA_DATA(rta) NULL + +#define DEFINE_SPINLOCK(x) +#define LIST_HEAD(x) + + + +////////////For bit operations From include/linux/bitops.h ///////// +/* + * fls: find last bit set.
+ */ + +extern int fls(int x); +extern int fls64(__u64 x); + +///// For 64 bit division from include/asm-generic/div64.h //// +#define do_div(n,base) ({ \ + uint32_t __base = (base); \ + uint32_t __rem; \ + __rem = ((uint64_t)(n)) % __base; \ + (n) = ((uint64_t)(n)) / __base; \ + __rem; \ + }) +#endif diff -urN ns-2.31-orig/tcp/linux/ns-linux-util.cc ns-2.31-linux/tcp/linux/ns-linux-util.cc --- ns-2.31-orig/tcp/linux/ns-linux-util.cc 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/ns-linux-util.cc 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,27 @@ +/* + * TCP-Linux module for NS2 + * + * May 2006 + * + * Author: Xiaoliang (David) Wei (DavidWei@acm.org) + * + * NetLab, the California Institute of Technology + * http://netlab.caltech.edu + * + * Module: linux/ns-linux-util.cc + * This is the utilities for linkages between NS-2 source codes (in C++) and Linux source codes (in C) + * + * + * See a mini-tutorial about TCP-Linux at: http://www.cs.caltech.edu/~weixl/ns2.html + * + */ + +#include "ns-linux-util.h" +__u32 tcp_time_stamp=0; +unsigned char sysctl_tcp_abc=1; +unsigned char tcp_max_burst=3; + +void tcp_cong_avoid_register(void) { +//Please put all the necessary register function here. 
+} + diff -urN ns-2.31-orig/tcp/linux/ns-linux-util.h ns-2.31-linux/tcp/linux/ns-linux-util.h --- ns-2.31-orig/tcp/linux/ns-linux-util.h 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/ns-linux-util.h 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,313 @@ +/* + * TCP-Linux module for NS2 + * + * May 2006 + * + * Author: Xiaoliang (David) Wei (DavidWei@acm.org) + * + * NetLab, the California Institute of Technology + * http://netlab.caltech.edu + * + * Module: linux/ns-linux-util.h + * This is the header file for linkages between NS-2 source codes (in C++) and Linux source codes (in C) + * + * See a mini-tutorial about TCP-Linux at: http://www.cs.caltech.edu/~weixl/ns2.html + * + */ + +#ifndef NS_LINUX_UTIL_H +#define NS_LINUX_UTIL_H + +/* Please list all the new protocols here: */ +extern struct tcp_congestion_ops tcp_highspeed; +extern struct tcp_congestion_ops tcp_reno; +extern struct tcp_congestion_ops bictcp; +extern struct tcp_congestion_ops htcp; +extern struct tcp_congestion_ops tcp_scalable; +extern struct tcp_congestion_ops tcp_westwood; +extern struct tcp_congestion_ops tcp_vegas; +extern struct tcp_congestion_ops tcp_hybla; +extern struct tcp_congestion_ops cubictcp; +extern struct tcp_congestion_ops naive_reno; +extern struct tcp_congestion_ops tcp_veno; +extern struct tcp_congestion_ops tcp_compound; +extern struct tcp_congestion_ops tcp_lp; +#define JIFFY_RATIO 1000 +#define US_RATIO 1000000 +#define MS_RATIO 1000 + +#define jiffies_to_usecs(x) ((US_RATIO/JIFFY_RATIO)*(x)) /* argument parenthesized so a compound expression is scaled as a whole */ +#define msecs_to_jiffies(x) ((JIFFY_RATIO/MS_RATIO)*(x)) /* argument parenthesized so a compound expression is scaled as a whole */ + +extern void tcp_cong_avoid_register(void); + +#define __u64 unsigned long long +#define __u32 unsigned long +#define __u16 unsigned int +#define __u8 unsigned char + +#define u64 __u64 +#define u32 __u32 +#define u16 __u16 +#define u8 __u8 + +#define s32 long +#define s64 long long + +#define u_int64_t u64 +#define u_int32_t u32 +#define uint32_t u32 +#define uint64_t u64 + +#define inet_csk(sk) sk +#define
tcp_sk(sk) sk +#define inet_csk_ca(sk) (sk->icsk_ca_priv) + + +//from kernel.h +#define min_t(type,x,y) \ + (((type)x) < ((type)y) ? ((type)x): ((type)y)) + +//#define max(x,y) ((x>y)? x:y) + +/* Events passed to congestion control interface */ +enum tcp_ca_event { + CA_EVENT_TX_START, /* first transmit when no packets in flight */ + CA_EVENT_CWND_RESTART, /* congestion window restart */ + CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ + CA_EVENT_FRTO, /* fast recovery timeout */ + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_FAST_ACK, /* in sequence ack */ + CA_EVENT_SLOW_ACK, /* other ack */ +}; + +#define sock tcp_sock +#define inet_sock tcp_sock +#define inet_connection_sock tcp_sock +struct sk_buff; +struct sock; +/* + * Interface for adding new TCP congestion control handlers + */ +#define TCP_CA_NAME_MAX 16 +struct tcp_congestion_ops { +// struct list_head list; + + /* initialize private data (optional) */ + void (*init)(struct sock *sk); + /* cleanup private data (optional) */ + void (*release)(struct sock *sk); + + /* return slow start threshold (required) */ + u32 (*ssthresh)(struct sock *sk); + /* lower bound for congestion window (optional) */ + u32 (*min_cwnd)(struct sock *sk); + /* do new cwnd calculation (required) */ + void (*cong_avoid)(struct sock *sk, u32 ack, + u32 rtt, u32 in_flight, int good_ack); + /* round trip time sample per acked packet (optional) */ + void (*rtt_sample)(struct sock *sk, u32 usrtt); + /* call before changing ca_state (optional) */ + void (*set_state)(struct sock *sk, u8 new_state); + /* call when cwnd event occurs (optional) */ + void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* new value of cwnd after loss (optional) */ + u32 (*undo_cwnd)(struct sock *sk); + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, u32 num_acked); + /* get info for inet_diag (optional) */ + void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb); + + char name[TCP_CA_NAME_MAX]; + 
struct module *owner; +}; + +struct tcp_options_received { +/* PAWS/RTTM data */ +// long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ +// __u32 ts_recent; /* Time stamp to echo next */ + __u32 rcv_tsval; /* Time stamp value */ + __u32 rcv_tsecr; /* Time stamp echo reply */ + __u16 saw_tstamp : 1, /* Saw TIMESTAMP on last packet */ + dump_xxx: 15; +// tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ +// dsack : 1, /* D-SACK is scheduled */ +// wscale_ok : 1, /* Wscale seen on SYN packet */ +// sack_ok : 4, /* SACK seen on SYN packet */ +// snd_wscale : 4, /* Window scaling received from sender */ +// rcv_wscale : 4; /* Window scaling to send to receiver */ +/* SACKs data */ +// __u8 eff_sacks; /* Size of SACK array to send with next packet */ +// __u8 num_sacks; /* Number of SACK blocks */ +/// __u16 user_mss; /* mss requested by user in ioctl */ +// __u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ +}; + +struct tcp_sock { +/* inet_connection_sock has to be the first member of tcp_sock */ +// struct inet_connection_sock inet_conn; +// int tcp_header_len; /* Bytes of tcp header to send */ + +/* + * Header prediction flags + * 0x5?10 << 16 + snd_wnd in net byte order + */ +// __u32 pred_flags; + +/* + * RFC793 variables by their proper names. This means you can + * read the code and the spec side by side (and laugh ...) + * See RFC793 and RFC1122. The RFC writes these in capitals. 
+ */ +// __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + + __u32 snd_una; /* First byte we want an ack for */ +// __u32 snd_sml; /* Last byte of the most recently transmitted small packet */ +// __u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ +// __u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ + + /* Data for direct copy to user */ +// struct { +// struct sk_buff_head prequeue; +// struct task_struct *task; +// struct iovec *iov; +// int memory; +// int len; +// } ucopy; + +// __u32 snd_wl1; /* Sequence for window update */ +// __u32 snd_wnd; /* The window we expect to receive */ +// __u32 max_window; /* Maximal window ever seen from peer */ +// __u32 pmtu_cookie; /* Last pmtu seen by socket */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ +// __u16 xmit_size_goal; /* Goal for segmenting output packets */ +// __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ +// +// __u32 window_clamp; /* Maximal window to advertise */ +// __u32 rcv_ssthresh; /* Current window clamp */ +// +// __u32 frto_highmark; /* snd_nxt when RTO occurred */ +// __u8 reordering; /* Packet reordering metric. */ +// __u8 frto_counter; /* Number of new acks after RTO */ +// __u8 nonagle; /* Disable Nagle algorithm? */ +// __u8 keepalive_probes; /* num of allowed keep alive probes */ + +/* RTT measurement */ + __u32 srtt; /* smoothed round trip time << 3 */ +// __u32 mdev; /* medium deviation */ +// __u32 mdev_max; /* maximal mdev for the last rtt period */ +// __u32 rttvar; /* smoothed mdev_max */ +// __u32 rtt_seq; /* sequence number to update rttvar */ +// +// __u32 packets_out; /* Packets which are "in flight" */ +// __u32 left_out; /* Packets which leaved network */ +// __u32 retrans_out; /* Retransmitted packets out */ +/* + * Options received (usually on last packet, some only on SYN packets). 
+ */ + struct tcp_options_received rx_opt; + +/* + * Slow start and congestion control (see also Nagle, and Karn & Partridge) + */ + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 snd_cwnd; /* Sending congestion window */ + __u16 snd_cwnd_cnt; /* Linear increase counter */ + __u16 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ +// __u32 snd_cwnd_used; + __u32 snd_cwnd_stamp; + __u32 bytes_acked; +// +// struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ +// +// struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ +// + // __u32 rcv_wnd; /* Current receiver window */ +// __u32 rcv_wup; /* rcv_nxt on last window update sent */ +// __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ +// __u32 pushed_seq; /* Last pushed seq, required to talk to windows */ +// __u32 copied_seq; /* Head of yet unread data */ +// +/* SACKs data */ +// struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ +// struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ + +// __u16 advmss; /* Advertised MSS */ +// __u16 prior_ssthresh; /* ssthresh saved at recovery start */ +// __u32 lost_out; /* Lost packets */ +// __u32 sacked_out; /* SACK'd packets */ +// __u32 fackets_out; /* FACK'd packets */ +// __u32 high_seq; /* snd_nxt at onset of congestion */ +// +// __u32 retrans_stamp; /* Timestamp of the last retransmit, +// * also used in SYN-SENT to remember stamp of +// * the first SYN. */ +// __u32 undo_marker; /* tracking retrans started here. */ +// int undo_retrans; /* number of undoable retransmissions. */ +// __u32 urg_seq; /* Seq of received urgent pointer */ +// __u16 urg_data; /* Saved octet of OOB data and control flags */ +// __u8 urg_mode; /* In urgent mode */ +// __u8 ecn_flags; /* ECN status bits. 
*/ +// __u32 snd_up; /* Urgent pointer */ +// +// __u32 total_retrans; /* Total retransmits for entire connection */ +// +// unsigned int keepalive_time; /* time before keep alive takes place */ +// unsigned int keepalive_intvl; /* time interval between keep alive probes */ +// int linger2; +// +// unsigned long last_synq_overflow; +// +/* Receiver side RTT estimation */ +// struct { +// __u32 rtt; +// __u32 seq; +// __u32 time; +// } rcv_rtt_est; + +/* Receiver queue space */ +// struct { +// int space; +// __u32 seq; +// __u32 time; +// } rcvq_space; + struct tcp_congestion_ops *icsk_ca_ops; + __u8 icsk_ca_state; + u32 icsk_ca_priv[16]; +#define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) +}; +extern unsigned long tcp_time_stamp; +extern unsigned char sysctl_tcp_abc; +extern unsigned char tcp_max_burst; + +struct sk_buff { + +}; + +extern struct tcp_congestion_ops tcp_init_congestion_ops; + + +extern u32 tcp_reno_min_cwnd(struct sock *sk); + +enum tcp_ca_state +{ + TCP_CA_Open = 0, +#define TCPF_CA_Open (1< +#include +#include +#include +*/ +#include "ns-linux-c.h" +#include "ns-linux-util.h" + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ + +static int fast_convergence = 1; +static int max_increment = 16; +static int low_window = 14; +static int bbeta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int initial_ssthresh = 100; +static int smooth_part = 20; + +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(max_increment, int, 0644); +MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); +module_param(low_window, int, 0644); +MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); +module_param(bbeta, int, 0644); +MODULE_PARM_DESC(bbeta, "beta for multiplicative increase"); 
+module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(smooth_part, int, 0644); +MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); + + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 epoch_start; /* beginning of an epoch */ +#define ACK_RATIO_SHIFT 4 + u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->epoch_start = 0; + ca->delayed_ack = 2 << ACK_RATIO_SHIFT; +} + +static void bictcp_init(struct sock *sk) +{ + bictcp_reset(inet_csk_ca(sk)); + if (initial_ssthresh) + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; +} + +/* + * Compute congestion window to use. 
+ */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) /* record the beginning of an epoch */ + ca->epoch_start = tcp_time_stamp; + + /* start off normal */ + if (cwnd <= low_window) { + ca->cnt = cwnd; + return; + } + + /* binary increase */ + if (cwnd < ca->last_max_cwnd) { + __u32 dist = (ca->last_max_cwnd - cwnd) + / BICTCP_B; + + if (dist > max_increment) + /* linear increase */ + ca->cnt = cwnd / max_increment; + else if (dist <= 1U) + /* binary search increase */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else + /* binary search increase */ + ca->cnt = cwnd / dist; + } else { + /* slow start AND linear increase */ + if (cwnd < ca->last_max_cwnd + BICTCP_B) + /* slow start */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) + /* slow start */ + ca->cnt = (cwnd * (BICTCP_B-1)) + / (cwnd - ca->last_max_cwnd); + else + /* linear increase */ + ca->cnt = cwnd / max_increment; + } + + /* if in slow start or link utilization is very low */ + if (ca->loss_cwnd == 0) { + if (ca->cnt > 20) /* increase cwnd 5% per RTT */ + ca->cnt = 20; + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + +static void bictcp_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int data_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + bictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly.
+ * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +/* + * behave like Reno until low_window is reached, + * then increase congestion window slowly + */ +static u32 bictcp_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + ca->epoch_start = 0; /* end of epoch */ + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + bbeta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + + if (tp->snd_cwnd <= low_window) + return max(tp->snd_cwnd >> 1U, 2U); + else + return max((tp->snd_cwnd * bbeta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct bictcp *ca = inet_csk_ca(sk); + return max(tp->snd_cwnd, ca->last_max_cwnd); +} + +static u32 bictcp_min_cwnd(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return tp->snd_ssthresh; +} + +static void bictcp_state(struct sock *sk, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(inet_csk_ca(sk)); +} + +/* Track delayed acknowledgment ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct sock *sk, u32 cnt) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + struct bictcp *ca = inet_csk_ca(sk); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops bictcp = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .min_cwnd = bictcp_min_cwnd, + 
.pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "bic", +}; + +static int __init bictcp_register(void) +{ + BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&bictcp); +} + +static void __exit bictcp_unregister(void) +{ + tcp_unregister_congestion_control(&bictcp); +} + +module_init(bictcp_register); +module_exit(bictcp_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("BIC TCP"); diff -urN ns-2.31-orig/tcp/linux/tcp_compound.c ns-2.31-linux/tcp/linux/tcp_compound.c --- ns-2.31-orig/tcp/linux/tcp_compound.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_compound.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,470 @@ +/* + * TCP Vegas congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. 
+ * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + * + * + * TCP Compound based on TCP Vegas + * + * further details can be found here: + * ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf + */ +/* +#include +#include +#include +#include +#include + +#include +*/ +#include "ns-linux-c.h" +#include "ns-linux-util.h" +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 + +#define TCP_COMPOUND_ALPHA 3U +#define TCP_COMPOUND_BETA 1U +#define TCP_COMPOUND_GAMMA 30 +#define TCP_COMPOUND_ZETA 1 + +/* TCP compound variables */ +struct compound { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now; /* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ + + u32 cwnd; + u32 dwnd; +}; + +/* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. 
+ * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ +static inline void cvegas_enable(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct compound *vegas = inet_csk_ca(sk); + + /* Begin taking Vegas samples next time we send something. */ + vegas->doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + vegas->beg_snd_nxt = tp->snd_nxt; + + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; +} + +/* Stop taking Vegas samples for now. */ +static inline void cvegas_disable(struct sock *sk) +{ + struct compound *vegas = inet_csk_ca(sk); + + vegas->doing_vegas_now = 0; +} + +static void tcp_compound_init(struct sock *sk) +{ + struct compound *vegas = inet_csk_ca(sk); + const struct tcp_sock *tp = tcp_sk(sk); + + vegas->baseRTT = 0x7fffffff; + cvegas_enable(sk); + + vegas->dwnd = 0; + vegas->cwnd = tp->snd_cwnd; +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct compound *vegas = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} + +static void tcp_compound_state(struct sock *sk, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + cvegas_enable(sk); + else + cvegas_disable(sk); +} + + +/* 64bit divisor, dividend and result. 
dynamic precision:
+ * when the divisor does not fit in 32 bits, divisor and dividend are both
+ * shifted right by the same amount (derived from fls() of the divisor's
+ * upper half) before dividing, so low-order bits are given up in exchange
+ * for a 32-bit divisor.  The caller must pass a non-zero divisor.
+ */
+static inline u64 cdiv64_64(u64 dividend, u64 divisor)
+{
+	u32 d = divisor;
+
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+
+	/* avoid 64 bit division if possible;
+	 * NOTE(review): do_div() presumably divides "dividend" in place,
+	 * as the kernel macro does - confirm against ns-linux-util.h
+	 */
+	if (dividend >> 32)
+		do_div(dividend, d);
+	else
+		dividend = (u32) dividend / d;
+
+	return dividend;
+}
+
+/* calculate the quartic root of "a" using Newton-Raphson;
+ * iterates until two successive estimates differ by at most 1
+ */
+static u32 qroot(u64 a)
+{
+	u32 x, x1;
+
+	/* Initial estimate is based on:
+	 * qrt(x) = exp(log(x) / 4)
+	 * i.e. a power of two whose exponent is fls64(a) / 4
+	 */
+	x = 1u << (fls64(a) >> 2);
+
+	/*
+	 * Iteration based on:
+	 *                         3
+	 * x    = ( 3 * x  +  a / x  ) / 4
+	 *  k+1          k         k
+	 *
+	 * The cube x3 is accumulated in 64 bits before the division.
+	 */
+	do {
+		u64 x3 = x;
+
+		x1 = x;
+		x3 *= x;
+		x3 *= x;
+
+		x = (3 * x + (u32) cdiv64_64(a, x3)) / 4;
+	} while (abs(x1 - x) > 1);	/* NOTE(review): operands are u32; relies on the conversion to int inside abs() */
+
+	return x;
+}
+
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples.  So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */ +static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) + tcp_compound_init(sk); +} + +static void tcp_compound_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct compound *vegas = inet_csk_ca(sk); + u8 inc = 0; + + if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) { + + if (vegas->cwnd > tp->snd_cwnd || + vegas->dwnd > tp->snd_cwnd) { + + vegas->cwnd = tp->snd_cwnd; + vegas->dwnd = 0; + + } else + vegas->cwnd = tp->snd_cwnd - vegas->dwnd; + + } + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (vegas->cwnd <= tp->snd_ssthresh) + inc = 1; + else if (tp->snd_cwnd_cnt < tp->snd_cwnd) + tp->snd_cwnd_cnt++; + + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + inc = 1; + tp->snd_cwnd_cnt = 0; + } + + if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp) + vegas->cwnd++; + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. 
+ */ + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + if (!tp->mss_cache) + return; + + old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = vegas->beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_una = vegas->beg_snd_nxt; + vegas->beg_snd_nxt = tp->snd_nxt; + vegas->beg_snd_cwnd = tp->snd_cwnd; + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT > 2) { + u32 rtt, target_cwnd, diff; + u32 brtt, dwnd; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. 
+ */ + if (!rtt) + return; + + brtt = vegas->baseRTT; + target_cwnd = ((old_wnd * brtt) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + dwnd = vegas->dwnd; + + if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) { + u64 win3; + u64 x; + /* + * The TCP Compound paper describes the choice + * of "k" determines the agressiveness, + * ie. slope of the response function. + * + * For same value as HSTCP would be 0.8 + * but for computaional reasons, both the + * original authors and this implementation + * use 0.75. + */ + win3 = old_wnd; + win3 *= old_wnd; + win3 *= old_wnd; + x = qroot(win3) >> TCP_COMPOUND_ALPHA; + + if (x > 1) + dwnd = x - 1; + else + dwnd = 0; + + dwnd += vegas->dwnd; + + } else if ((dwnd << V_PARAM_SHIFT) < + (diff * TCP_COMPOUND_BETA)) + dwnd = 0; + else + dwnd = + ((dwnd << V_PARAM_SHIFT) - + (diff * + TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT; + + vegas->dwnd = dwnd; + + } + + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + + tp->snd_cwnd = vegas->cwnd + vegas->dwnd; +} + + +/* Extract info for Tcp socket info provided via netlink. 
*/
+/* Fill a tcpvegas_info attribute with the current sampling state
+ * (enabled flag, sample count, baseRTT, minRTT).  Nothing is written
+ * unless the INET_DIAG_VEGASINFO bit is set in ext.
+ */
+static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+{
+	const struct compound *ca = inet_csk_ca(sk);
+	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+		struct tcpvegas_info *info;
+
+		/* NOTE(review): __RTA_PUT presumably jumps to the
+		 * rtattr_failure label below when the attribute does not
+		 * fit in skb - confirm against the macro definition.
+		 */
+		info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
+					  sizeof(*info)));
+
+		info->tcpv_enabled = ca->doing_vegas_now;
+		info->tcpv_rttcnt = ca->cntRTT;
+		info->tcpv_rtt = ca->baseRTT;
+		info->tcpv_minrtt = ca->minRTT;
+	rtattr_failure:;
+	}
+}
+
+/* Operations table; ssthresh and min_cwnd are borrowed from Reno. */
+static struct tcp_congestion_ops tcp_compound = {
+	.init		= tcp_compound_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_compound_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+	.rtt_sample	= tcp_compound_rtt_calc,
+	.set_state	= tcp_compound_state,
+	.cwnd_event	= tcp_compound_cwnd_event,
+	.get_info	= tcp_compound_get_info,
+
+	.owner		= THIS_MODULE,
+	.name		= "compound",
+};
+
+/* Register the "compound" algorithm.
+ * Returns 0 on success or the negative error code reported by
+ * tcp_register_congestion_control(), instead of always claiming
+ * success; this matches the other modules' register functions.
+ */
+static int __init tcp_compound_register(void)
+{
+	/* The private state must fit in the per-socket CA area. */
+	BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_compound);
+}
+
+/* Remove the "compound" algorithm from the list of available ones. */
+static void __exit tcp_compound_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_compound);
+}
+
+module_init(tcp_compound_register);
+module_exit(tcp_compound_unregister);
+
+MODULE_AUTHOR("Angelo P. Castellani / Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Compound");
+
+
+
+
+
+
+
+
+
+
+
+
diff -urN ns-2.31-orig/tcp/linux/tcp_cong.c ns-2.31-linux/tcp/linux/tcp_cong.c
--- ns-2.31-orig/tcp/linux/tcp_cong.c	1969-12-31 16:00:00.000000000 -0800
+++ ns-2.31-linux/tcp/linux/tcp_cong.c	2007-07-18 15:35:46.000000000 -0700
@@ -0,0 +1,284 @@
+/*
+ * Pluggable TCP congestion control support and newReno
+ * congestion control.
+ * Based on ideas from I/O scheduler support and Web100.
+ *
+ * Copyright (C) 2005 Stephen Hemminger
+ */
+/*
+#include
+#include
+#include
+#include
+#include
+#include
+*/
+#include "ns-linux-c.h"
+#include "ns-linux-util.h"
+
+static DEFINE_SPINLOCK(tcp_cong_list_lock);
+static LIST_HEAD(tcp_cong_list);
+
+/* Simple linear search, don't expect many entries!
+ * Returns the registered entry whose name matches exactly,
+ * or NULL when there is none.
+ */
+static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+{
+	struct tcp_congestion_ops *e;
+
+	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+		if (strcmp(e->name, name) == 0)
+			return e;
+	}
+
+	return NULL;
+}
+
+/*
+ * Attach new congestion control algorithm to the list
+ * of available options.
+ *
+ * Returns 0 on success, -EINVAL when a required op is missing,
+ * or -EEXIST when an algorithm of the same name is already listed.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+	int ret = 0;
+
+	/* all algorithms must implement ssthresh, cong_avoid and min_cwnd ops */
+	if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
+		printk(KERN_ERR "TCP %s does not implement required ops\n",
+		       ca->name);
+		return -EINVAL;
+	}
+
+	spin_lock(&tcp_cong_list_lock);
+	if (tcp_ca_find(ca->name)) {
+		printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
+		ret = -EEXIST;
+	} else {
+		list_add_rcu(&ca->list, &tcp_cong_list);
+		printk(KERN_INFO "TCP %s registered\n", ca->name);
+	}
+	spin_unlock(&tcp_cong_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
+
+/*
+ * Remove congestion control algorithm, called from
+ * the module's remove function.  Module ref counts are used
+ * to ensure that this can't be done till all sockets using
+ * that method are closed.
+ *
+ * The entry is unlinked under tcp_cong_list_lock; no check is made
+ * here that it was registered in the first place.
+ */
+void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
+{
+	spin_lock(&tcp_cong_list_lock);
+	list_del_rcu(&ca->list);
+	spin_unlock(&tcp_cong_list_lock);
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+
+/* Assign choice of congestion control.
*/ +void tcp_init_congestion_control(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_congestion_ops *ca; + + if (icsk->icsk_ca_ops != &tcp_init_congestion_ops) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (try_module_get(ca->owner)) { + icsk->icsk_ca_ops = ca; + break; + } + + } + rcu_read_unlock(); + + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); +} + +/* Manage refcounts on socket close. */ +void tcp_cleanup_congestion_control(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + module_put(icsk->icsk_ca_ops->owner); +} + +/* Used by sysctl to change default congestion control */ +int tcp_set_default_congestion_control(const char *name) +{ + struct tcp_congestion_ops *ca; + int ret = -ENOENT; + + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); +#ifdef CONFIG_KMOD + if (!ca) { + spin_unlock(&tcp_cong_list_lock); + + request_module("tcp_%s", name); + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); + } +#endif + + if (ca) { + list_move(&ca->list, &tcp_cong_list); + ret = 0; + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} + +/* Get current default congestion control */ +void tcp_get_default_congestion_control(char *name) +{ + struct tcp_congestion_ops *ca; + /* We will always have reno... 
*/ + BUG_ON(list_empty(&tcp_cong_list)); + + rcu_read_lock(); + ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + strncpy(name, ca->name, TCP_CA_NAME_MAX); + rcu_read_unlock(); +} + +/* Change congestion control for socket */ +int tcp_set_congestion_control(struct sock *sk, const char *name) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_congestion_ops *ca; + int err = 0; + + rcu_read_lock(); + ca = tcp_ca_find(name); + if (ca == icsk->icsk_ca_ops) + goto out; + + if (!ca) + err = -ENOENT; + + else if (!try_module_get(ca->owner)) + err = -EBUSY; + + else { + tcp_cleanup_congestion_control(sk); + icsk->icsk_ca_ops = ca; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + } + out: + rcu_read_unlock(); + return err; +} + + +/* + * Linear increase during slow start + */ +void tcp_slow_start(struct tcp_sock *tp) +{ + if (sysctl_tcp_abc) { + /* RFC3465: Slow Start + * TCP sender SHOULD increase cwnd by the number of + * previously unacknowledged bytes ACKed by each incoming + * acknowledgment, provided the increase is not more than L + */ + if (tp->bytes_acked < tp->mss_cache) + return; + + /* We MAY increase by 2 if discovered delayed ack */ + if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } + tp->bytes_acked = 0; + + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; +} +EXPORT_SYMBOL_GPL(tcp_slow_start); + +/* + * TCP Reno congestion control + * This is special case used for fallback as well. + */ +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* In "safe" area, increase. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + /* In dangerous area, increase slowly. 
*/ + else if (sysctl_tcp_abc) { + /* RFC3465: Apppriate Byte Count + * increase once for each full cwnd acked + */ + if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { + tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { + /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); + +/* Slow start threshold is half the congestion window (min 2) */ +u32 tcp_reno_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return max(tp->snd_cwnd >> 1U, 2U); +} +EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); + +/* Lower bound on congestion window. */ +u32 tcp_reno_min_cwnd(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return tp->snd_ssthresh/2; +} +EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); + +struct tcp_congestion_ops tcp_reno = { + .name = "reno", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; + +/* Initial congestion control used (until SYN) + * really reno under another name so we can tell difference + * during tcp_set_default_congestion_control + */ +struct tcp_congestion_ops tcp_init_congestion_ops = { + .name = "", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; +EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff -urN ns-2.31-orig/tcp/linux/tcp_cubic.c ns-2.31-linux/tcp/linux/tcp_cubic.c --- ns-2.31-orig/tcp/linux/tcp_cubic.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_cubic.c 2007-07-18 15:46:33.000000000 -0700 @@ -0,0 +1,417 @@ +/* + * TCP CUBIC: Binary Increase Congestion control for TCP v2.0 + * + * This is from the implementation of CUBIC TCP in + * Injong Rhee, Lisong Xu. 
+ * "CUBIC: A New TCP-Friendly High-Speed TCP Variant + * in PFLDnet 2005 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + * + * Unless CUBIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ +/* +#include +#include +#include +#include +#include +*/ + +#include "ns-linux-c.h" +#include "ns-linux-util.h" + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ +#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ + +static int cfast_convergence = 1; +static int cmax_increment = 16; +static int cbeta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int cinitial_ssthresh = 100; +static int cbic_scale = 41; +static int tcp_friendliness = 1; + +static u32 cube_rtt_scale; +static u32 beta_scale; +static u64 cube_factor; + +/* Note parameters that are used for precomputing scale factors are read-only */ +module_param(cfast_convergence, int, 0644); +MODULE_PARM_DESC(cfast_convergence, "turn on/off fast convergence"); +module_param(cmax_increment, int, 0644); +MODULE_PARM_DESC(cmax_increment, "Limit on increment allowed during binary search"); +module_param(cbeta, int, 0444); +MODULE_PARM_DESC(cbeta, "beta for multiplicative increase"); +module_param(cinitial_ssthresh, int, 0644); +MODULE_PARM_DESC(cinitial_ssthresh, "initial value of slow start threshold"); +module_param(cbic_scale, int, 0444); +MODULE_PARM_DESC(cbic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); +module_param(tcp_friendliness, int, 0644); +MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); + +/* +#include +*/ + +/* BIC TCP Parameters */ +struct cbictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; 
/* time when updated last_cwnd */
+	u32	bic_origin_point;/* origin point of bic function */
+	u32	bic_K;		/* time to origin point from the beginning of the current epoch */
+	u32	delay_min;	/* min delay */
+	u32	epoch_start;	/* beginning of an epoch */
+	u32	ack_cnt;	/* number of acks */
+	u32	tcp_cwnd;	/* estimated tcp cwnd */
+#define ACK_RATIO_SHIFT	4
+	u32	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
+};
+
+/* Return every field of the CUBIC state to its initial value.
+ * All fields are zeroed except delayed_ack, which starts at a ratio
+ * of 2 packets per ACK (stored shifted left by ACK_RATIO_SHIFT).
+ */
+static inline void cbictcp_reset(struct cbictcp *ca)
+{
+	ca->cnt = 0;
+	ca->last_max_cwnd = 0;
+	ca->loss_cwnd = 0;
+	ca->last_cwnd = 0;
+	ca->last_time = 0;
+	ca->bic_origin_point = 0;
+	ca->bic_K = 0;
+	ca->delay_min = 0;
+	ca->epoch_start = 0;
+	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+	ca->ack_cnt = 0;
+	ca->tcp_cwnd = 0;
+}
+
+/* Per-socket initialisation: reset the CUBIC state and, when
+ * cinitial_ssthresh is non-zero, install it as the socket's ssthresh.
+ */
+static void cbictcp_init(struct sock *sk)
+{
+	/* NOTE(review): the register function runs on every socket init;
+	 * presumably this is how the precomputed scale factors get set up
+	 * in the ns-2 port - confirm against ns-linux-util.h.
+	 */
+	cubictcp_register();
+	cbictcp_reset(inet_csk_ca(sk));
+	if (cinitial_ssthresh)
+		tcp_sk(sk)->snd_ssthresh = cinitial_ssthresh;
+}
+
+/* 64bit divisor, dividend and result. dynamic precision:
+ * a divisor wider than 32 bits is shifted right (together with the
+ * dividend) until it fits, so low-order bits are given up in exchange
+ * for a 32-bit divisor.  The caller must pass a non-zero divisor.
+ */
+static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+{
+	u_int32_t d = divisor;
+
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+
+	/* avoid 64 bit division if possible */
+	if (dividend >> 32)
+		do_div(dividend, d);
+	else
+		dividend = (uint32_t) dividend / d;
+
+	return dividend;
+}
+
+/*
+ * calculate the cubic root of a using Newton-Raphson;
+ * iterates until two successive estimates differ by at most 1
+ */
+static u32 cubic_root(u64 a)
+{
+	u32 x, x1;
+
+	/* Initial estimate is based on:
+	 * cbrt(x) = exp(log(x) / 3)
+	 */
+	x = 1u << (fls64(a)/3);
+
+	/*
+	 * Iteration based on:
+	 *                         2
+	 * x    = ( 2 * x  +  a / x  ) / 3
+	 *  k+1          k         k
+	 */
+	do {
+		x1 = x;
+		/* Square in 64 bits.  The estimate can be as large as 2^21,
+		 * so a 32-bit x*x wraps; at x == 65536 it wraps to exactly 0
+		 * and div64_64() would divide by zero.
+		 */
+		x = (2 * x + (uint32_t) div64_64(a, (u64)x * x)) / 3;
+	} while (abs(x1 - x) > 1);
+
+	return x;
+}
+
+/*
+ * Compute congestion window to use.
+ */ +static inline void cbictcp_update(struct cbictcp *ca, u32 cwnd) +{ + u64 offs; + u32 delta, t, bic_target, min_cnt, max_cnt; + + ca->ack_cnt++; /* count the number of ACKs */ + + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) { + ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ + ca->ack_cnt = 1; /* start counting */ + ca->tcp_cwnd = cwnd; /* syn with cubic */ + + if (ca->last_max_cwnd <= cwnd) { + ca->bic_K = 0; + ca->bic_origin_point = cwnd; + } else { + /* Compute new K based on + * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) + */ + ca->bic_K = cubic_root(cube_factor + * (ca->last_max_cwnd - cwnd)); + ca->bic_origin_point = ca->last_max_cwnd; + } + } + /* cubic function - calc*/ + /* calculate c * time^3 / rtt, + * while considering overflow in calculation of time^3 + * (so time^3 is done by using 64 bit) + * and without the support of division of 64bit numbers + * (so all divisions are done by using 32 bit) + * also NOTE the unit of those veriables + * time = (t - K) / 2^bictcp_HZ + * c = bic_scale >> 10 + * rtt = (srtt >> 3) / HZ + * !!! The following code does not have overflow problems, + * if the cwnd < 1 million packets !!! 
+ */ + + /* change the unit from HZ to cbictcp_HZ */ + t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) + << BICTCP_HZ) / HZ; + + if (t < ca->bic_K) /* t - K */ + offs = ca->bic_K - t; + else + offs = t - ca->bic_K; + + /* c/rtt * (t-K)^3 */ + delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); + if (t < ca->bic_K) /* below origin*/ + bic_target = ca->bic_origin_point - delta; + else /* above origin*/ + bic_target = ca->bic_origin_point + delta; + + /* cubic function - calc cbictcp_cnt*/ + if (bic_target > cwnd) { + ca->cnt = cwnd / (bic_target - cwnd); + } else { + ca->cnt = 100 * cwnd; /* very small increment*/ + } + + if (ca->delay_min > 0) { + /* max increment = Smax * rtt / 0.1 */ + min_cnt = (cwnd * HZ * 8)/(10 * cmax_increment * ca->delay_min); + if (ca->cnt < min_cnt) + ca->cnt = min_cnt; + } + + /* slow start and low utilization */ + if (ca->loss_cwnd == 0) /* could be aggressive in slow start */ + ca->cnt = 50; + + /* TCP Friendly */ + if (tcp_friendliness) { + u32 scale = beta_scale; + delta = (cwnd * scale) >> 3; + while (ca->ack_cnt > delta) { /* update tcp cwnd */ + ca->ack_cnt -= delta; + ca->tcp_cwnd++; + } + + if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ + delta = ca->tcp_cwnd - cwnd; + max_cnt = cwnd / delta; + if (ca->cnt > max_cnt) + ca->cnt = max_cnt; + } + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + + +/* Keep track of minimum rtt */ +static inline void measure_delay(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct cbictcp *ca = inet_csk_ca(sk); + u32 delay; + + /* No time stamp */ + if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || + /* Discard delay samples right after fast recovery */ + (s32)(tcp_time_stamp - ca->epoch_start) < HZ) + return; + + delay = (tcp_time_stamp - tp->rx_opt.rcv_tsecr)<<3; + if (delay == 0) + delay = 1; + + /* first time call or link delay decreases */ + if 
(ca->delay_min == 0 || ca->delay_min > delay) + ca->delay_min = delay; +} + +static void cbictcp_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int data_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct cbictcp *ca = inet_csk_ca(sk); + + if (data_acked) + measure_delay(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + cbictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +static u32 cbictcp_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct cbictcp *ca = inet_csk_ca(sk); + + ca->epoch_start = 0; /* end of epoch */ + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && cfast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + cbeta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + return max((tp->snd_cwnd * cbeta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 cbictcp_undo_cwnd(struct sock *sk) +{ + struct cbictcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); +} + +static u32 cbictcp_min_cwnd(struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +static void cbictcp_state(struct sock *sk, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + cbictcp_reset(inet_csk_ca(sk)); +} + +/* Track delayed acknowledgment ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void cbictcp_acked(struct sock *sk, u32 cnt) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + struct cbictcp *ca = inet_csk_ca(sk); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += 
cnt; + } +} + + +static struct tcp_congestion_ops cubictcp = { + .init = cbictcp_init, + .ssthresh = cbictcp_recalc_ssthresh, + .cong_avoid = cbictcp_cong_avoid, + .set_state = cbictcp_state, + .undo_cwnd = cbictcp_undo_cwnd, + .min_cwnd = cbictcp_min_cwnd, + .pkts_acked = cbictcp_acked, + .owner = THIS_MODULE, + .name = "cubic", +}; + +static int __init cubictcp_register(void) +{ + BUG_ON(sizeof(struct cbictcp) > ICSK_CA_PRIV_SIZE); + + /* Precompute a bunch of the scaling factors that are used per-packet + * based on SRTT of 100ms + */ + + beta_scale = 8*(BICTCP_BETA_SCALE+cbeta)/ 3 / (BICTCP_BETA_SCALE - cbeta); + + cube_rtt_scale = cbic_scale * 10; /* 1024*c/rtt */ + + /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 + * so K = cubic_root( (wmax-cwnd)*rtt/c ) + * the unit of K is cbictcp_HZ=2^10, not HZ + * + * c = cbic_scale >> 10 + * rtt = 100ms + * + * the following code has been designed and tested for + * cwnd < 1 million packets + * RTT < 100 seconds + * HZ < 1,000,00 (corresponding to 10 nano-second) + */ + + /* 1/c * 2^2*bictcp_HZ * srtt */ + cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */ + + /* divide by cbic_scale and by constant Srtt (100ms) */ + do_div(cube_factor, cbic_scale * 10); + + return tcp_register_congestion_control(&cubictcp); +} + +static void __exit cubictcp_unregister(void) +{ + tcp_unregister_congestion_control(&cubictcp); +} + +module_init(cubictcp_register); +module_exit(cubictcp_unregister); + +MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("CUBIC TCP"); +MODULE_VERSION("2.0"); diff -urN ns-2.31-orig/tcp/linux/tcp_highspeed.c ns-2.31-linux/tcp/linux/tcp_highspeed.c --- ns-2.31-orig/tcp/linux/tcp_highspeed.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_highspeed.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,185 @@ +/* + * Sally Floyd's High Speed TCP (RFC 3649) congestion control + * + * See http://www.icir.org/floyd/hstcp.html + * + * John Heffner + */ + 
+/* +#include +#include +#include +*/ +#include "ns-linux-c.h" +#include "ns-linux-util.h" +/* From AIMD tables from RFC 3649 appendix B, + * with fixed-point MD scaled <<8. + */ +static const struct hstcp_aimd_val { + unsigned int cwnd; + unsigned int md; +} hstcp_aimd_vals[] = { + { 38, 128, /* 0.50 */ }, + { 118, 112, /* 0.44 */ }, + { 221, 104, /* 0.41 */ }, + { 347, 98, /* 0.38 */ }, + { 495, 93, /* 0.37 */ }, + { 663, 89, /* 0.35 */ }, + { 851, 86, /* 0.34 */ }, + { 1058, 83, /* 0.33 */ }, + { 1284, 81, /* 0.32 */ }, + { 1529, 78, /* 0.31 */ }, + { 1793, 76, /* 0.30 */ }, + { 2076, 74, /* 0.29 */ }, + { 2378, 72, /* 0.28 */ }, + { 2699, 71, /* 0.28 */ }, + { 3039, 69, /* 0.27 */ }, + { 3399, 68, /* 0.27 */ }, + { 3778, 66, /* 0.26 */ }, + { 4177, 65, /* 0.26 */ }, + { 4596, 64, /* 0.25 */ }, + { 5036, 62, /* 0.25 */ }, + { 5497, 61, /* 0.24 */ }, + { 5979, 60, /* 0.24 */ }, + { 6483, 59, /* 0.23 */ }, + { 7009, 58, /* 0.23 */ }, + { 7558, 57, /* 0.22 */ }, + { 8130, 56, /* 0.22 */ }, + { 8726, 55, /* 0.22 */ }, + { 9346, 54, /* 0.21 */ }, + { 9991, 53, /* 0.21 */ }, + { 10661, 52, /* 0.21 */ }, + { 11358, 52, /* 0.20 */ }, + { 12082, 51, /* 0.20 */ }, + { 12834, 50, /* 0.20 */ }, + { 13614, 49, /* 0.19 */ }, + { 14424, 48, /* 0.19 */ }, + { 15265, 48, /* 0.19 */ }, + { 16137, 47, /* 0.19 */ }, + { 17042, 46, /* 0.18 */ }, + { 17981, 45, /* 0.18 */ }, + { 18955, 45, /* 0.18 */ }, + { 19965, 44, /* 0.17 */ }, + { 21013, 43, /* 0.17 */ }, + { 22101, 43, /* 0.17 */ }, + { 23230, 42, /* 0.17 */ }, + { 24402, 41, /* 0.16 */ }, + { 25618, 41, /* 0.16 */ }, + { 26881, 40, /* 0.16 */ }, + { 28193, 39, /* 0.16 */ }, + { 29557, 39, /* 0.15 */ }, + { 30975, 38, /* 0.15 */ }, + { 32450, 38, /* 0.15 */ }, + { 33986, 37, /* 0.15 */ }, + { 35586, 36, /* 0.14 */ }, + { 37253, 36, /* 0.14 */ }, + { 38992, 35, /* 0.14 */ }, + { 40808, 35, /* 0.14 */ }, + { 42707, 34, /* 0.13 */ }, + { 44694, 33, /* 0.13 */ }, + { 46776, 33, /* 0.13 */ }, + { 48961, 32, /* 0.13 */ }, + { 51258, 
32, /* 0.13 */ },
+	{ 53677, 31, /* 0.12 */ },
+	{ 56230, 30, /* 0.12 */ },
+	{ 58932, 30, /* 0.12 */ },
+	{ 61799, 29, /* 0.12 */ },
+	{ 64851, 28, /* 0.11 */ },
+	{ 68113, 28, /* 0.11 */ },
+	{ 71617, 27, /* 0.11 */ },
+	{ 75401, 26, /* 0.10 */ },
+	{ 79517, 26, /* 0.10 */ },
+	{ 84035, 25, /* 0.10 */ },
+	{ 89053, 24, /* 0.10 */ },
+};
+
+#define HSTCP_AIMD_MAX	ARRAY_SIZE(hstcp_aimd_vals)
+
+/* Per-socket HighSpeed TCP state. */
+struct hstcp {
+	u32	ai;	/* index of the hstcp_aimd_vals[] row currently in use */
+};
+
+/* Start at the first table row and cap snd_cwnd_clamp so that the
+ * multiplicative-decrease product in hstcp_ssthresh() stays in range.
+ */
+static void hstcp_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hstcp *ca = inet_csk_ca(sk);
+
+	ca->ai = 0;
+
+	/* Ensure the MD arithmetic works.  This is somewhat pedantic,
+	 * since I don't think we will see a cwnd this large. :) */
+	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+
+/* Congestion avoidance for HighSpeed TCP.
+ *
+ * Does nothing unless the connection is cwnd-limited.  At or below
+ * ssthresh it defers to tcp_slow_start(); above it, it re-selects the
+ * AIMD table row for the current cwnd and applies the additive increase.
+ * The adk, rtt and data_acked arguments are not used.
+ */
+static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
+			     u32 in_flight, int data_acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hstcp *ca = inet_csk_ca(sk);
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
+		/* Update AIMD parameters.
+		 *
+		 * We want to guarantee that:
+		 *     hstcp_aimd_vals[ca->ai-1].cwnd <
+		 *     snd_cwnd <=
+		 *     hstcp_aimd_vals[ca->ai].cwnd
+		 *
+		 * The downward walk must compare against the row below;
+		 * re-testing "snd_cwnd > vals[ai].cwnd" there can never be
+		 * true, so the index would never come back down.
+		 */
+		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
+			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+			       ca->ai < HSTCP_AIMD_MAX - 1)
+				ca->ai++;
+		} else if (ca->ai &&
+			   tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
+			while (ca->ai &&
+			       tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
+				ca->ai--;
+		}
+
+		/* Do additive increase */
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+			/* cwnd = cwnd + a(w) / cwnd, with a(w) = ai + 1:
+			 * row 0 must still grow by one segment per window,
+			 * otherwise cwnd never increases in the first band.
+			 */
+			tp->snd_cwnd_cnt += ca->ai + 1;
+			if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+				/* Subtract the old cwnd before bumping it so
+				 * the unsigned counter cannot wrap when it
+				 * equals cwnd exactly.
+				 */
+				tp->snd_cwnd_cnt -= tp->snd_cwnd;
+				tp->snd_cwnd++;
+			}
+		}
+	}
+}
+
+/* New slow start threshold after a loss: cwnd reduced by the current
+ * row's md factor (fixed point, scaled << 8), never below 2.
+ */
+static u32 hstcp_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct hstcp *ca = inet_csk_ca(sk);
+
+	/* Do multiplicative decrease */
+	return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
+}
+
+
+static struct tcp_congestion_ops tcp_highspeed = {
+	.init		= hstcp_init,
+	.ssthresh	= hstcp_ssthresh,
+	.cong_avoid	=
hstcp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "highspeed" +}; + +static int __init hstcp_register(void) +{ + BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_highspeed); +} + +static void __exit hstcp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_highspeed); +} + +module_init(hstcp_register); +module_exit(hstcp_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("High Speed TCP"); diff -urN ns-2.31-orig/tcp/linux/tcp_htcp.c ns-2.31-linux/tcp/linux/tcp_htcp.c --- ns-2.31-orig/tcp/linux/tcp_htcp.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_htcp.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,299 @@ +/* + * H-TCP congestion control. The algorithm is detailed in: + * R.N.Shorten, D.J.Leith: + * "H-TCP: TCP for high-speed and long-distance networks" + * Proc. PFLDnet, Argonne, 2004. + * http://www.hamilton.ie/net/htcp3.pdf + */ +/* +#include +#include +#include +#include +*/ +#include "ns-linux-c.h" +#include "ns-linux-util.h" + +#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */ +#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */ +#define BETA_MAX 102 /* 0.8 with shift << 7 */ + +static int use_rtt_scaling = 1; +module_param(use_rtt_scaling, int, 0644); +MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling"); + +static int use_bandwidth_switch = 1; +module_param(use_bandwidth_switch, int, 0644); +MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher"); + +struct htcp { + u16 alpha; /* Fixed point arith, << 7 */ + u8 beta; /* Fixed point arith, << 7 */ + u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */ + u8 ccount; /* Number of RTTs since last congestion event */ + u8 undo_ccount; + u16 packetcount; + u32 minRTT; + u32 maxRTT; + u32 snd_cwnd_cnt2; + + u32 undo_maxRTT; + u32 undo_old_maxB; + + /* Bandwidth estimation */ + u32 minB; + u32 maxB; + u32 
old_maxB; + u32 Bi; + u32 lasttime; +}; + +static inline void htcp_reset(struct htcp *ca) +{ + ca->undo_ccount = ca->ccount; + ca->undo_maxRTT = ca->maxRTT; + ca->undo_old_maxB = ca->old_maxB; + + ca->ccount = 0; + ca->snd_cwnd_cnt2 = 0; +} + +static u32 htcp_cwnd_undo(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); + ca->ccount = ca->undo_ccount; + ca->maxRTT = ca->undo_maxRTT; + ca->old_maxB = ca->undo_old_maxB; + return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); +} + +static inline void measure_rtt(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); + u32 srtt = tp->srtt>>3; + + /* keep track of minimum RTT seen so far, minRTT is zero at first */ + if (ca->minRTT > srtt || !ca->minRTT) + ca->minRTT = srtt; + + /* max RTT */ + if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { + if (ca->maxRTT < ca->minRTT) + ca->maxRTT = ca->minRTT; + if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) + ca->maxRTT = srtt; + } +} + +static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); + u32 now = tcp_time_stamp; + + /* achieved throughput calculations */ + if (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_Disorder) { + ca->packetcount = 0; + ca->lasttime = now; + return; + } + + ca->packetcount += pkts_acked; + + if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? 
: 1) + && now - ca->lasttime >= ca->minRTT + && ca->minRTT > 0) { + __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime); + if (ca->ccount <= 3) { + /* just after backoff */ + ca->minB = ca->maxB = ca->Bi = cur_Bi; + } else { + ca->Bi = (3*ca->Bi + cur_Bi)/4; + if (ca->Bi > ca->maxB) + ca->maxB = ca->Bi; + if (ca->minB > ca->maxB) + ca->minB = ca->maxB; + } + ca->packetcount = 0; + ca->lasttime = now; + } +} + +static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) +{ + if (use_bandwidth_switch) { + u32 maxB = ca->maxB; + u32 old_maxB = ca->old_maxB; + ca->old_maxB = ca->maxB; + + if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) { + ca->beta = BETA_MIN; + ca->modeswitch = 0; + return; + } + } + + if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) { + ca->beta = (minRTT<<7)/maxRTT; + if (ca->beta < BETA_MIN) + ca->beta = BETA_MIN; + else if (ca->beta > BETA_MAX) + ca->beta = BETA_MAX; + } else { + ca->beta = BETA_MIN; + ca->modeswitch = 1; + } +} + +static inline void htcp_alpha_update(struct htcp *ca) +{ + u32 minRTT = ca->minRTT; + u32 factor = 1; + u32 diff = ca->ccount * minRTT; /* time since last backoff */ + + if (diff > HZ) { + diff -= HZ; + factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ; + } + + if (use_rtt_scaling && minRTT) { + u32 scale = (HZ<<3)/(10*minRTT); + scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */ + factor = (factor<<3)/scale; + if (!factor) + factor = 1; + } + + ca->alpha = 2*factor*((1<<7)-ca->beta); + if (!ca->alpha) + ca->alpha = ALPHA_BASE; +} + +/* After we have the rtt data to calculate beta, we'd still prefer to wait one + * rtt before we adjust our beta to ensure we are working from a consistent + * data. + * + * This function should be called when we hit a congestion event since only at + * that point do we really have a real sense of maxRTT (the queues en route + * were getting just too full now). 
+ */ +static void htcp_param_update(struct sock *sk) +{ + struct htcp *ca = inet_csk_ca(sk); + u32 minRTT = ca->minRTT; + u32 maxRTT = ca->maxRTT; + + htcp_beta_update(ca, minRTT, maxRTT); + htcp_alpha_update(ca); + + /* add slowly fading memory for maxRTT to accommodate routing changes etc */ + if (minRTT > 0 && maxRTT > minRTT) + ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; +} + +static u32 htcp_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct htcp *ca = inet_csk_ca(sk); + htcp_param_update(sk); + return max((tp->snd_cwnd * ca->beta) >> 7, 2U); +} + +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, + u32 in_flight, int data_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + + measure_rtt(sk); + + /* keep track of number of round-trip times since last backoff event */ + if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { + ca->ccount++; + ca->snd_cwnd_cnt2 = 0; + htcp_alpha_update(ca); + } + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd + */ + if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } +} + +/* Lower bound on congestion window. 
*/ +static u32 htcp_min_cwnd(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return tp->snd_ssthresh; +} + + +static void htcp_init(struct sock *sk) +{ + struct htcp *ca = inet_csk_ca(sk); + + memset(ca, 0, sizeof(struct htcp)); + ca->alpha = ALPHA_BASE; + ca->beta = BETA_MIN; +} + +static void htcp_state(struct sock *sk, u8 new_state) +{ + switch (new_state) { + case TCP_CA_CWR: + case TCP_CA_Recovery: + case TCP_CA_Loss: + htcp_reset(inet_csk_ca(sk)); + break; + } +} + +static struct tcp_congestion_ops htcp = { + .init = htcp_init, + .ssthresh = htcp_recalc_ssthresh, + .min_cwnd = htcp_min_cwnd, + .cong_avoid = htcp_cong_avoid, + .set_state = htcp_state, + .undo_cwnd = htcp_cwnd_undo, + .pkts_acked = measure_achieved_throughput, + .owner = THIS_MODULE, + .name = "htcp", +}; + +static int __init htcp_register(void) +{ + BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE); + BUILD_BUG_ON(BETA_MIN >= BETA_MAX); + if (!use_bandwidth_switch) + htcp.pkts_acked = NULL; + return tcp_register_congestion_control(&htcp); +} + +static void __exit htcp_unregister(void) +{ + tcp_unregister_congestion_control(&htcp); +} + +module_init(htcp_register); +module_exit(htcp_unregister); + +MODULE_AUTHOR("Baruch Even"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("H-TCP"); diff -urN ns-2.31-orig/tcp/linux/tcp_hybla.c ns-2.31-linux/tcp/linux/tcp_hybla.c --- ns-2.31-orig/tcp/linux/tcp_hybla.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_hybla.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,191 @@ +/* + * TCP HYBLA + * + * TCP-HYBLA Congestion control algorithm, based on: + * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement + * for Heterogeneous Networks", + * International Journal on satellite Communications, + * September 2004 + * Daniele Lacamera + * root at danielinux.net + */ +/* +#include +#include +#include +*/ +#include "ns-linux-c.h" +#include "ns-linux-util.h" + +/* Tcp Hybla structure. 
*/ +struct hybla { + u8 hybla_en; + u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ + u32 rho; /* Rho parameter, integer part */ + u32 rho2; /* Rho * Rho, integer part */ + u32 rho_3ls; /* Rho parameter, <<3 */ + u32 rho2_7ls; /* Rho^2, <<7 */ + u32 minrtt; /* Minimum smoothed round trip time value seen */ +}; + +/* Hybla reference round trip time (default= 1/40 sec = 25 ms), + expressed in jiffies */ +static int rtt0 = 25; +module_param(rtt0, int, 0644); +MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); + + +/* This is called to refresh values for hybla parameters */ +static inline void hybla_recalc_param (struct sock *sk) +{ + struct hybla *ca = inet_csk_ca(sk); + + ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho = ca->rho_3ls >> 3; + ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; + ca->rho2 = ca->rho2_7ls >>7; +} + +static void hybla_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct hybla *ca = inet_csk_ca(sk); + + ca->rho = 0; + ca->rho2 = 0; + ca->rho_3ls = 0; + ca->rho2_7ls = 0; + ca->snd_cwnd_cents = 0; + ca->hybla_en = 1; + tp->snd_cwnd = 2; + tp->snd_cwnd_clamp = 65535; + + /* 1st Rho measurement based on initial srtt */ + hybla_recalc_param(sk); + + /* set minimum rtt as this is the 1st ever seen */ + ca->minrtt = tp->srtt; + tp->snd_cwnd = ca->rho; +} + +static void hybla_state(struct sock *sk, u8 ca_state) +{ + struct hybla *ca = inet_csk_ca(sk); + ca->hybla_en = (ca_state == TCP_CA_Open); +} + +static inline u32 hybla_fraction(u32 odds) +{ + static const u32 fractions[] = { + 128, 139, 152, 165, 181, 197, 215, 234, + }; + + return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128; +} + +/* TCP Hybla main routine. 
+ * This is the algorithm behavior: + * o Recalc Hybla parameters if min_rtt has changed + * o Give cwnd a new value based on the model proposed + * o remember increments <1 + */ +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct hybla *ca = inet_csk_ca(sk); + u32 increment, odd, rho_fractions; + int is_slowstart = 0; + + /* Recalculate rho only if this srtt is the lowest */ + if (tp->srtt < ca->minrtt){ + hybla_recalc_param(sk); + ca->minrtt = tp->srtt; + } + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (!ca->hybla_en) + return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); + + if (ca->rho == 0) + hybla_recalc_param(sk); + + rho_fractions = ca->rho_3ls - (ca->rho << 3); + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* + * slow start + * INC = 2^RHO - 1 + * This is done by splitting the rho parameter + * into 2 parts: an integer part and a fraction part. + * Increment<<7 is estimated by doing: + * [2^(int+fract)]<<7 + * that is equal to: + * (2^int) * [(2^fract) <<7] + * 2^int is straightly computed as 1<<int, + * while we will use hybla_fraction() to + * calculate 2^fract in a <<7 value. + */ + is_slowstart = 1; + increment = ((1 << ca->rho) * hybla_fraction(rho_fractions)) + - 128; + } else { + /* + * congestion avoidance + * INC = RHO^2 / W + * as long as increment is estimated as (rho<<7)/window + * it already is <<7 and we can easily count its fractions. + */ + increment = ca->rho2_7ls / tp->snd_cwnd; + if (increment < 128) + tp->snd_cwnd_cnt++; + } + + odd = increment % 128; + tp->snd_cwnd += increment >> 7; + ca->snd_cwnd_cents += odd; + + /* check when fractions goes >=128 and increase cwnd by 1. */ + while(ca->snd_cwnd_cents >= 128) { + tp->snd_cwnd++; + ca->snd_cwnd_cents -= 128; + tp->snd_cwnd_cnt = 0; + } + + /* clamp down slowstart cwnd to ssthresh value. 
*/ + if (is_slowstart) + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); +} + +static struct tcp_congestion_ops tcp_hybla = { + .init = hybla_init, + .ssthresh = tcp_reno_ssthresh, + .min_cwnd = tcp_reno_min_cwnd, + .cong_avoid = hybla_cong_avoid, + .set_state = hybla_state, + + .owner = THIS_MODULE, + .name = "hybla" +}; + +static int __init hybla_register(void) +{ + BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_hybla); +} + +static void __exit hybla_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_hybla); +} + +module_init(hybla_register); +module_exit(hybla_unregister); + +MODULE_AUTHOR("Daniele Lacamera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Hybla"); diff -urN ns-2.31-orig/tcp/linux/tcp_lp.c ns-2.31-linux/tcp/linux/tcp_lp.c --- ns-2.31-orig/tcp/linux/tcp_lp.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_lp.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,348 @@ +/* + * TCP Low Priority (TCP-LP) + * + * TCP Low Priority is a distributed algorithm whose goal is to utilize only + * the excess network bandwidth as compared to the ``fair share`` of + * bandwidth as targeted by TCP. Available from: + * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf + * + * Original Author: + * Aleksandar Kuzmanovic + * + * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation. + * As of 2.6.13, Linux supports pluggable congestion control algorithms. + * Due to the limitation of the API, we take the following changes from + * the original TCP-LP implementation: + * o We use newReno in most core CA handling. Only add some checking + * within cong_avoid. + * o Error correcting in remote HZ, therefore remote HZ will be keeped + * on checking and updating. + * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne + * OWD have a similar meaning as RTT. Also correct the buggy formular. 
+ * o Handle reaction for Early Congestion Indication (ECI) within + * pkts_acked, as mentioned within pseudo code. + * o OWD is handled in relative format, where local time stamp will in + * tcp_time_stamp format. + * + * Port from 2.4.19 to 2.6.16 as module by: + * Wong Hoi Sing Edison + * Hung Hing Lun + * + * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $ + */ + +/* +#include +#include +#include +*/ +#include "ns-linux-c.h" +#include "ns-linux-util.h" + +#ifndef CONFIG_TCP_CONG_LP_DEBUG +#define CONFIG_TCP_CONG_LP_DEBUG 0 +#endif + +/* resolution of owd */ +#define LP_RESOL 1000 + +/** + * enum tcp_lp_state + * @LP_VALID_RHZ: is remote HZ valid? + * @LP_VALID_OWD: is OWD valid? + * @LP_WITHIN_THR: are we within threshold? + * @LP_WITHIN_INF: are we within inference? + * + * TCP-LP's state flags. + * We create this set of state flag mainly for debugging. + */ +enum tcp_lp_state { + LP_VALID_RHZ = (1 << 0), + LP_VALID_OWD = (1 << 1), + LP_WITHIN_THR = (1 << 3), + LP_WITHIN_INF = (1 << 4), +}; + +/** + * struct lp + * @flag: TCP-LP state flag + * @sowd: smoothed OWD << 3 + * @owd_min: min OWD + * @owd_max: max OWD + * @owd_max_rsv: resrved max owd + * @remote_hz: estimated remote HZ + * @remote_ref_time: remote reference time + * @local_ref_time: local reference time + * @last_drop: time for last active drop + * @inference: current inference + * + * TCP-LP's private struct. + * We get the idea from original TCP-LP implementation where only left those we + * found are really useful. + */ +struct lp { + u32 flag; + u32 sowd; + u32 owd_min; + u32 owd_max; + u32 owd_max_rsv; + u32 remote_hz; + u32 remote_ref_time; + u32 local_ref_time; + u32 last_drop; + u32 inference; +}; + +/** + * tcp_lp_init + * + * Init all required variables. + * Clone the handling from Vegas module implementation. 
+ */ +static void tcp_lp_init(struct sock *sk) +{ + struct lp *lp = inet_csk_ca(sk); + + lp->flag = 0; + lp->sowd = 0; + lp->owd_min = 0xffffffff; + lp->owd_max = 0; + lp->owd_max_rsv = 0; + lp->remote_hz = 0; + lp->remote_ref_time = 0; + lp->local_ref_time = 0; + lp->last_drop = 0; + lp->inference = 0; +} + +/** + * tcp_lp_cong_avoid + * + * Implementation of cong_avoid. + * Will only call newReno CA when away from inference. + * From TCP-LP's paper, this will be handled in additive increasement. + */ +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + struct lp *lp = inet_csk_ca(sk); + + if (!(lp->flag & LP_WITHIN_INF)) + tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); +} + +/** + * tcp_lp_remote_hz_estimator + * + * Estimate remote HZ. + * We keep on updating the estimated value, where original TCP-LP + * implementation only guest it for once and use forever. + */ +static inline u32 tcp_lp_remote_hz_estimator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */ + s64 m = 0; + + /* not yet record reference time + * go away!! record it before come back!! */ + if (lp->remote_ref_time == 0 || lp->local_ref_time == 0) + goto out; + + /* we can't calc remote HZ with no different!! 
*/ + if (tp->rx_opt.rcv_tsval == lp->remote_ref_time + || tp->rx_opt.rcv_tsecr == lp->local_ref_time) + goto out; + + m = HZ * (tp->rx_opt.rcv_tsval - + lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - + lp->local_ref_time); + if (m < 0) + m = -m; + + if (rhz != 0) { + m -= (rhz >> 6); /* m is now error in remote HZ est */ + rhz += m; /* 63/64 old + 1/64 new */ + } else + rhz = m << 6; + + /* record time for successful remote HZ calc */ + lp->flag |= LP_VALID_RHZ; + + out: + /* record reference time stamp */ + lp->remote_ref_time = tp->rx_opt.rcv_tsval; + lp->local_ref_time = tp->rx_opt.rcv_tsecr; + + return rhz >> 6; +} + +/** + * tcp_lp_owd_calculator + * + * Calculate one way delay (in relative format). + * Original implement OWD as minus of remote time difference to local time + * difference directly. As this time difference just simply equal to RTT, when + * the network status is stable, remote RTT will equal to local RTT, and result + * OWD into zero. + * It seems to be a bug and so we fixed it. + */ +static inline u32 tcp_lp_owd_calculator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 owd = 0; + + lp->remote_hz = tcp_lp_remote_hz_estimator(sk); + + if ((lp->flag & LP_VALID_RHZ) && lp->remote_hz) { /* remote_hz is a divisor: leave owd at 0 (invalid) while the estimate is still 0 */ + owd = + tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - + tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); + if (owd < 0) + owd = -owd; + } + + if (owd > 0) + lp->flag |= LP_VALID_OWD; + else + lp->flag &= ~LP_VALID_OWD; + + return owd; +} + +/** + * tcp_lp_rtt_sample + * + * Implementation or rtt_sample. + * Will take the following action, + * 1. calc OWD, + * 2. record the min/max OWD, + * 3. calc smoothed OWD (SOWD). + * Most ideas come from the original TCP-LP implementation. 
+ */ +static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) +{ + struct lp *lp = inet_csk_ca(sk); + s64 mowd = tcp_lp_owd_calculator(sk); + + /* sorry that we don't have valid data */ + if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD)) + return; + + /* record the next min owd */ + if (mowd < lp->owd_min) + lp->owd_min = mowd; + + /* always forget the max of the max + * we just set owd_max as one below it */ + if (mowd > lp->owd_max) { + if (mowd > lp->owd_max_rsv) { + if (lp->owd_max_rsv == 0) + lp->owd_max = mowd; + else + lp->owd_max = lp->owd_max_rsv; + lp->owd_max_rsv = mowd; + } else + lp->owd_max = mowd; + } + + /* calc for smoothed owd */ + if (lp->sowd != 0) { + mowd -= (lp->sowd >> 3); /* m is now error in owd est */ + lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */ + } else + lp->sowd = mowd << 3; /* take the measured time be owd */ +} + +/** + * tcp_lp_pkts_acked + * + * Implementation of pkts_acked. + * Deal with active drop under Early Congestion Indication. + * Only drop to half and 1 will be handle, because we hope to use back + * newReno in increase case. 
+ * We work it out by following the idea from TCP-LP's paper directly + */ +static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + + /* calc inference */ + if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) + lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); + + /* test if within inference */ + if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) + lp->flag |= LP_WITHIN_INF; + else + lp->flag &= ~LP_WITHIN_INF; + + /* test if within threshold */ + if (lp->sowd >> 3 < + lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100) + lp->flag |= LP_WITHIN_THR; + else + lp->flag &= ~LP_WITHIN_THR; + +#if CONFIG_TCP_CONG_LP_DEBUG == 1 + printk(KERN_DEBUG "TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, + tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max, + lp->sowd >> 3); +#endif + + if (lp->flag & LP_WITHIN_THR) + return; + + /* FIXME: try to reset owd_min and owd_max here + * so decrease the chance the min/max is no longer suitable + * and will usually within threshold when within inference */ + lp->owd_min = (lp->sowd >> 3); + lp->owd_max = (lp->sowd >> 2); + lp->owd_max_rsv = (lp->sowd >> 2); + + /* happened within inference + * drop snd_cwnd into 1 */ + if (lp->flag & LP_WITHIN_INF) + tp->snd_cwnd = 1U; + + /* happened after inference + * cut snd_cwnd into half */ + else + tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); + + /* record this drop time */ + lp->last_drop = tcp_time_stamp; +} + +static struct tcp_congestion_ops tcp_lp = { + .init = tcp_lp_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_lp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_lp_rtt_sample, + .pkts_acked = tcp_lp_pkts_acked, + + .owner = THIS_MODULE, + .name = "lp" +}; + +static int __init lp_register(void) +{ + BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_lp); +} + +static void __exit lp_unregister(void) +{ + 
tcp_unregister_congestion_control(&tcp_lp); +} + +module_init(lp_register); +module_exit(lp_unregister); + +MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Low Priority"); diff -urN ns-2.31-orig/tcp/linux/tcp_naivereno.c ns-2.31-linux/tcp/linux/tcp_naivereno.c --- ns-2.31-orig/tcp/linux/tcp_naivereno.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_naivereno.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,45 @@ +/* This is a very naive Reno implementation, shown as an example on how to develop a new congestion control algorithm with TCP-Linux. + * + * See a mini-tutorial about TCP-Linux at: http://www.cs.caltech.edu/~weixl/ns2.html + * + */ + + +#include "ns-linux-c.h" +#include "ns-linux-util.h" + +/* opencwnd */ +void tcp_naive_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, int flag) +{ + if (tp->snd_cwnd < tp->snd_ssthresh) { + tp->snd_cwnd++; + } else { + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else { + tp->snd_cwnd_cnt++; + } + } +} + +/* ssthreshold should be half of the congestion window after a loss */ +u32 tcp_naive_reno_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd >> 1U, 2U); +} + + +/* congestion window should be equal to the slow start threshold (after slow start threshold set to half of cwnd before loss). 
*/ +u32 tcp_naive_reno_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh; +} + +struct tcp_congestion_ops naive_reno = { + .name = "naive_reno", + .ssthresh = tcp_naive_reno_ssthresh, + .cong_avoid = tcp_naive_reno_cong_avoid, + .min_cwnd = tcp_naive_reno_min_cwnd +}; diff -urN ns-2.31-orig/tcp/linux/tcp_scalable.c ns-2.31-linux/tcp/linux/tcp_scalable.c --- ns-2.31-orig/tcp/linux/tcp_scalable.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_scalable.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,73 @@ +/* Tom Kelly's Scalable TCP + * + * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/ + * + * John Heffner + */ +/* +#include +#include +#include +*/ +#include "ns-linux-c.h" +#include "ns-linux-util.h" + +/* These factors derived from the recommended values in the aer: + * .01 and and 7/8. We use 50 instead of 100 to account for + * delayed ack. + */ +#define TCP_SCALABLE_AI_CNT 50U +#define TCP_SCALABLE_MD_SCALE 3 + +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + tp->snd_cwnd_cnt++; + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } +} + +static u32 tcp_scalable_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); +} + + +static struct tcp_congestion_ops tcp_scalable = { + .ssthresh = tcp_scalable_ssthresh, + .cong_avoid = tcp_scalable_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "scalable", +}; + +static int __init tcp_scalable_register(void) +{ + return tcp_register_congestion_control(&tcp_scalable); +} + +static void __exit tcp_scalable_unregister(void) +{ + 
tcp_unregister_congestion_control(&tcp_scalable); +} + +module_init(tcp_scalable_register); +module_exit(tcp_scalable_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Scalable TCP"); diff -urN ns-2.31-orig/tcp/linux/tcp_vegas.c ns-2.31-linux/tcp/linux/tcp_vegas.c --- ns-2.31-orig/tcp/linux/tcp_vegas.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_vegas.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,391 @@ +/* + * TCP Vegas congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. 
+ * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + */ +/* +#include +#include +#include +#include +#include + +#include +*/ +#include "ns-linux-c.h" +#include "ns-linux-util.h" +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +static int valpha = 1<<V_PARAM_SHIFT; +static int vbeta = 3<<V_PARAM_SHIFT; +static int gamma = 1<<V_PARAM_SHIFT; + +module_param(valpha, int, 0644); +MODULE_PARM_DESC(valpha, "lower bound of packets in network (scale by 2)"); +module_param(vbeta, int, 0644); +MODULE_PARM_DESC(vbeta, "upper bound of packets in network (scale by 2)"); +module_param(gamma, int, 0644); +MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); + + +/* Vegas variables. NOTE(review): vbeta/gamma, the module params, this struct and the head of vegas_enable() were lost in extraction and restored from the upstream tcp_vegas.c layout -- confirm against the pristine patch. */ +struct vegas { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now;/* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ +}; + +/* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. + * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ +static inline void vegas_enable(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct vegas *vegas = inet_csk_ca(sk); + + /* Begin taking Vegas samples next time we send something. */ + vegas->doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + vegas->beg_snd_nxt = tp->snd_nxt; + + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; +} + +/* Stop taking Vegas samples for now. */ +static inline void vegas_disable(struct sock *sk) +{ + struct vegas *vegas = inet_csk_ca(sk); + + vegas->doing_vegas_now = 0; +} + +static void tcp_vegas_init(struct sock *sk) +{ + struct vegas *vegas = inet_csk_ca(sk); + + vegas->baseRTT = 0x7fffffff; + vegas_enable(sk); +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct vegas *vegas = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. 
delay + queuing delay: + */ + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} + +static void tcp_vegas_state(struct sock *sk, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + vegas_enable(sk); + else + vegas_disable(sk); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ +static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || + event == CA_EVENT_TX_START) + tcp_vegas_init(sk); +} + +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct vegas *vegas = inet_csk_ca(sk); + + if (!vegas->doing_vegas_now) + return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. 
+ * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = vegas->beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_una = vegas->beg_snd_nxt; + vegas->beg_snd_nxt = tp->snd_nxt; + vegas->beg_snd_cwnd = tp->snd_cwnd; + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas + * calculation, so we'll behave like Reno. + */ + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + } else { + u32 rtt, target_cwnd, diff; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. 
+ * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + target_cwnd = ((old_wnd * vegas->baseRTT) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* Slow start. */ + if (diff > gamma) { + /* Going too fast. Time to slow down + * and switch to congestion avoidance. + */ + tp->snd_ssthresh = 2; + + /* Set cwnd to match the actual rate + * exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer + * truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min(tp->snd_cwnd, + (target_cwnd >> + V_PARAM_SHIFT)+1); + + } + tcp_slow_start(tp); + } else { + /* Congestion avoidance. */ + u32 next_snd_cwnd; + + /* Figure out where we would like cwnd + * to be. + */ + if (diff > vbeta) { + /* The old window was too fast, so + * we slow down. + */ + next_snd_cwnd = old_snd_cwnd - 1; + } else if (diff < valpha) { + /* We don't have enough extra packets + * in the network, so speed up. + */ + next_snd_cwnd = old_snd_cwnd + 1; + } else { + /* Sending just as fast as we + * should be. + */ + next_snd_cwnd = old_snd_cwnd; + } + + /* Adjust cwnd upward or downward, toward the + * desired value. + */ + if (next_snd_cwnd > tp->snd_cwnd) + tp->snd_cwnd++; + else if (next_snd_cwnd < tp->snd_cwnd) + tp->snd_cwnd--; + } + + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } + + /* Wipe the slate clean for the next RTT. 
*/ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + /* Use normal slow start */ + else if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + +} + +/* Extract Vegas state for the TCP socket info provided via netlink: when ext has bit (INET_DIAG_VEGASINFO - 1) set, a tcpvegas_info attribute is put into skb and filled from the per-socket vegas struct. */ +static void tcp_vegas_get_info(struct sock *sk, u32 ext, + struct sk_buff *skb) +{ + const struct vegas *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcpvegas_info *info; + + info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, + sizeof(*info))); + + info->tcpv_enabled = ca->doing_vegas_now; + info->tcpv_rttcnt = ca->cntRTT; + info->tcpv_rtt = ca->baseRTT; + info->tcpv_minrtt = ca->minRTT; + rtattr_failure: ; /* presumably the label __RTA_PUT jumps to when it fails - confirm against the macro definition */ + } +} + +static struct tcp_congestion_ops tcp_vegas = { /* Vegas hook table; ssthresh and min_cwnd reuse the tcp_reno_* functions */ + .init = tcp_vegas_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_vegas_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_vegas_rtt_calc, + .set_state = tcp_vegas_state, + .cwnd_event = tcp_vegas_cwnd_event, + .get_info = tcp_vegas_get_info, + + .owner = THIS_MODULE, + .name = "vegas", +}; + +static int __init tcp_vegas_register(void) +{ + BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE); /* struct vegas must not be larger than ICSK_CA_PRIV_SIZE */ + tcp_register_congestion_control(&tcp_vegas); /* NOTE(review): the result is discarded and 0 is returned unconditionally, while tcp_westwood_register returns it - confirm which is intended */ + return 0; +} + +static void __exit tcp_vegas_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_vegas); +} + +module_init(tcp_vegas_register); +module_exit(tcp_vegas_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Vegas"); diff -urN ns-2.31-orig/tcp/linux/tcp_veno.c ns-2.31-linux/tcp/linux/tcp_veno.c --- ns-2.31-orig/tcp/linux/tcp_veno.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_veno.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,256 @@ +/* + * TCP Veno congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * C. P. Fu, S. C. Liew. + * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." 
+ * IEEE Journal on Selected Areas in Communication, + * Feb. 2003. + * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + */ +/* Include list that is commented out here; the header names are missing in this copy +#include +#include +#include +#include +#include + +#include +*/ +#include "ns-linux-c.h" +#include "ns-linux-util.h" + +/* Default values of the Veno variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +static int vnbeta = 3<doing_veno_now = 1; /* NOTE(review): this line is truncated in the middle in this copy; struct veno and veno_enable, both used below, have no visible definition - restore the missing text from the original patch */ + + veno->minrtt = 0x7fffffff; +} + +static inline void veno_disable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn off Veno */ + veno->doing_veno_now = 0; +} + +static void tcp_veno_init(struct sock *sk) /* Reset the per-socket Veno state and switch Veno on */ +{ + struct veno *veno = inet_csk_ca(sk); + + veno->basertt = 0x7fffffff; /* large start value, so the first sample in tcp_veno_rtt_calc replaces it */ + veno->inc = 1; + veno->cntrtt = 0; + veno_enable(sk); +} + +/* Do RTT sampling needed for Veno: fold one sample (usrtt + 1) into basertt and minrtt and count it in cntrtt. */ +static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct veno *veno = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < veno->basertt) + veno->basertt = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + veno->minrtt = min(veno->minrtt, vrtt); + veno->cntrtt++; +} + +static void tcp_veno_state(struct sock *sk, u8 ca_state) /* Veno is enabled only while ca_state is TCP_CA_Open; any other state disables it */ +{ + + if (ca_state == TCP_CA_Open) + veno_enable(sk); + else + veno_disable(sk); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Veno calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Veno state to a clean + * state. After we get acks for this flight of + * packets, _then_ we can make Veno calculations + * again. 
+ */ +static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || + event == CA_EVENT_TX_START) + tcp_veno_init(sk); +} + +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) /* cong_avoid hook for Veno; hands off to tcp_reno_cong_avoid while Veno is disabled or cntrtt is at most 2 */ +{ + struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (!veno->doing_veno_now) + return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + + /* limited by applications */ + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* We do the Veno calculations only if we got enough RTT samples */ + if (veno->cntrtt <= 2) { + /* We don't have enough RTT samples to do the Veno + * calculation, so we'll behave like Reno. + */ + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + } else { + u32 rtt, target_cwnd; + + /* We have enough RTT samples, so, using the Veno + * algorithm, we determine the state of the network. + */ + + rtt = veno->minrtt; + + target_cwnd = ((tp->snd_cwnd * veno->basertt) + << V_PARAM_SHIFT) / rtt; /* cwnd scaled by basertt / minrtt, with V_PARAM_SHIFT fractional bits */ + + veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; /* gap between the current cwnd and target_cwnd, same fixed-point format */ + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* Slow start. */ + tcp_slow_start(tp); + } + else if (sysctl_tcp_abc) { + /* RFC3465: Appropriate Byte Count + * increase once for each full cwnd acked. + * Veno has no idea about it so far, so we keep + * it as Reno. + */ + if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { + tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + }else { + /* Congestion avoidance. */ + if (veno->diff < vnbeta) { + /* In the "non-congestive state", increase cwnd + * every rtt. + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } else { + /* In the "congestive state", increase cwnd + * every other rtt. 
+ */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (veno->inc && tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + veno->inc = 0; + } + else + veno->inc = 1; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + + } + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } + /* Wipe the state clean for the next RTT (this runs on every call that gets past the early returns above). */ + veno->minrtt = 0x7fffffff; +} + +/* Veno MD phase: ssthresh hook, returns 4/5 of cwnd when diff < vnbeta and half of cwnd otherwise, never less than 2. NOTE(review): not declared static, unlike the other hooks in this file. */ +u32 tcp_veno_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + if(veno->diff < vnbeta) { + /* in "non-congestive state", cut cwnd by 1/5 */ + return max(tp->snd_cwnd*4/5, 2U); + }else { + /* in "congestive state", cut cwnd by 1/2 */ + return max(tp->snd_cwnd >> 1U, 2U); + } +} + +u32 tcp_veno_min_cwnd(struct sock *sk) /* min_cwnd hook: returns the current snd_ssthresh */ +{ + const struct tcp_sock *tp = tcp_sk(sk); + return tp->snd_ssthresh; +} + + +static struct tcp_congestion_ops tcp_veno = { /* Veno hook table; every hook set here is a tcp_veno_* function */ + .init = tcp_veno_init, + .ssthresh = tcp_veno_ssthresh, + .cong_avoid = tcp_veno_cong_avoid, + .min_cwnd = tcp_veno_min_cwnd, + .rtt_sample = tcp_veno_rtt_calc, + .set_state = tcp_veno_state, + .cwnd_event = tcp_veno_cwnd_event, + + .owner = THIS_MODULE, + .name = "veno", +}; + +static int __init tcp_veno_register(void) +{ + BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE); /* struct veno must not be larger than ICSK_CA_PRIV_SIZE */ + tcp_register_congestion_control(&tcp_veno); /* NOTE(review): the result is discarded and 0 is returned unconditionally, while tcp_westwood_register returns it - confirm which is intended */ + return 0; +} + +static void __exit tcp_veno_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_veno); +} + +module_init(tcp_veno_register); +module_exit(tcp_veno_unregister); + +MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Veno"); diff -urN ns-2.31-orig/tcp/linux/tcp_westwood.c ns-2.31-linux/tcp/linux/tcp_westwood.c --- ns-2.31-orig/tcp/linux/tcp_westwood.c 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/tcp_westwood.c 2007-07-18 15:35:46.000000000 -0700 @@ -0,0 +1,265 @@ +/* + * TCP Westwood+ + * + * Angelo Dell'Aera: TCP 
Westwood+ support + */ +/* Include list that is commented out here; the header names are missing in this copy +#include +#include +#include +#include +#include +#include +*/ +#include "ns-linux-c.h" +#include "ns-linux-util.h" +/* TCP Westwood structure */ +struct westwood { + u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ + u32 bw_est; /* bandwidth estimate */ + u32 rtt_win_sx; /* here starts a new evaluation... */ + u32 bk; /* acked amount accumulated since rtt_win_sx; zeroed after each filter run */ + u32 snd_una; /* used for evaluating the number of acked bytes */ + u32 cumul_ack; /* amount attributed to the most recent ack, see westwood_acked_count */ + u32 accounted; /* amount already credited for dupacks and not yet deducted from a later ack */ + u32 rtt; /* last value taken from srtt, see tcp_westwood_pkts_acked */ + u32 rtt_min; /* minimum observed RTT */ +}; + + +/* TCP Westwood functions and constants */ +#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ +#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ + +/* + * @tcp_westwood_init + * This function initializes fields used in TCP Westwood+, + * it is called after the initial SYN, so the sequence numbers + * are correct, but for new passive connections we have no + * information about RTTmin at this time so we simply set it to + * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative + * since in this way we're sure it will be updated in a consistent + * way as soon as possible. It will reasonably happen within the first + * RTT period of the connection lifetime. + */ +static void tcp_westwood_init(struct sock *sk) +{ + struct westwood *w = inet_csk_ca(sk); + + w->bk = 0; + w->bw_ns_est = 0; + w->bw_est = 0; + w->accounted = 0; + w->cumul_ack = 0; + w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; + w->rtt_win_sx = tcp_time_stamp; + w->snd_una = tcp_sk(sk)->snd_una; +} + +/* + * @westwood_do_filter + * Low-pass filter. Implemented using constant coefficients: (7 * a + b) / 8. + */ +static inline u32 westwood_do_filter(u32 a, u32 b) +{ + return (((7 * a) + b) >> 3); +} + +static inline void westwood_filter(struct westwood *w, u32 delta) /* Feed bk / delta into bw_ns_est, then bw_ns_est into bw_est; delta must be non-zero because it is used as a divisor */ +{ + w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); + w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); +} + +/* + * @westwood_pkts_acked + * Called after processing group of packets. 
+ * but all westwood needs is the last sample of srtt. + */ +static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt) +{ + struct westwood *w = inet_csk_ca(sk); + if (cnt > 0) + w->rtt = tcp_sk(sk)->srtt >> 3; /* presumably srtt is kept scaled by 8 - confirm */ +} + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. + */ +static void westwood_update_window(struct sock *sk) +{ + struct westwood *w = inet_csk_ca(sk); + s32 delta = tcp_time_stamp - w->rtt_win_sx; /* time elapsed since the current window was opened */ + + /* + * See if a RTT-window has passed. + * Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. + * This minimum limit was chosen since an estimation on small + * time intervals is better to avoid... + * Obviously on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { + westwood_filter(w, delta); + + w->bk = 0; + w->rtt_win_sx = tcp_time_stamp; + } +} + +/* + * @westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successful. In such case in fact update is + * straightforward and doesn't need any particular care. + */ +static inline void westwood_fast_bw(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); + + westwood_update_window(sk); + + w->bk += tp->snd_una - w->snd_una; /* add the advance of snd_una since it was last recorded */ + w->snd_una = tp->snd_una; + w->rtt_min = min(w->rtt, w->rtt_min); +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating bk in case of + * delayed or partial acks. + */ +static inline u32 westwood_acked_count(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); + + w->cumul_ack = tp->snd_una - w->snd_una; + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. 
+ */ + if (!w->cumul_ack) { + w->accounted += tp->mss_cache; + w->cumul_ack = tp->mss_cache; + } + + if (w->cumul_ack > tp->mss_cache) { + /* Partial or delayed ack: deduct what earlier dupacks already accounted for */ + if (w->accounted >= w->cumul_ack) { + w->accounted -= w->cumul_ack; + w->cumul_ack = tp->mss_cache; + } else { + w->cumul_ack -= w->accounted; + w->accounted = 0; + } + } + + w->snd_una = tp->snd_una; + + return w->cumul_ack; +} + +static inline u32 westwood_bw_rttmin(const struct sock *sk) /* bw_est * rtt_min divided by mss_cache, never less than 2 */ +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct westwood *w = inet_csk_ca(sk); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); +} + +/* + * TCP Westwood + * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it + * in packets we use mss_cache). The result is clamped to be >= 2 + * (see westwood_bw_rttmin), so this never returns 0. + */ +static u32 tcp_westwood_cwnd_min(struct sock *sk) +{ + return westwood_bw_rttmin(sk); +} + +static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) /* Updates the bandwidth sample on fast and slow acks; sets ssthresh (and cwnd on CA_EVENT_COMPLETE_CWR) from westwood_bw_rttmin on CWR completion and FRTO */ +{ + struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); + + switch(event) { + case CA_EVENT_FAST_ACK: + westwood_fast_bw(sk); + break; + + case CA_EVENT_COMPLETE_CWR: + tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); + break; + + case CA_EVENT_FRTO: + tp->snd_ssthresh = westwood_bw_rttmin(sk); + break; + + case CA_EVENT_SLOW_ACK: + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); + w->rtt_min = min(w->rtt, w->rtt_min); + break; + + default: + /* don't care */ + break; + } +} + + +/* Extract info for Tcp socket info provided via netlink. 
*/ +static void tcp_westwood_info(struct sock *sk, u32 ext, + struct sk_buff *skb) +{ + const struct westwood *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { /* reuses the Vegas attribute id and the tcpvegas_info layout */ + struct rtattr *rta; + struct tcpvegas_info *info; + + rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info)); + info = RTA_DATA(rta); + info->tcpv_enabled = 1; /* always reported as enabled */ + info->tcpv_rttcnt = 0; /* always reported as 0; struct westwood has no RTT sample counter */ + info->tcpv_rtt = jiffies_to_usecs(ca->rtt); + info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); + rtattr_failure: ; /* presumably the label __RTA_PUT jumps to when it fails - confirm against the macro definition */ + } +} + + +static struct tcp_congestion_ops tcp_westwood = { /* Westwood+ hook table; ssthresh and cong_avoid reuse the tcp_reno_* functions */ + .init = tcp_westwood_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_westwood_cwnd_min, + .cwnd_event = tcp_westwood_event, + .get_info = tcp_westwood_info, + .pkts_acked = tcp_westwood_pkts_acked, + + .owner = THIS_MODULE, + .name = "westwood" +}; + +static int __init tcp_westwood_register(void) +{ + BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE); /* struct westwood must not be larger than ICSK_CA_PRIV_SIZE */ + return tcp_register_congestion_control(&tcp_westwood); /* the registration result is passed back to the caller */ +} + +static void __exit tcp_westwood_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_westwood); +} + +module_init(tcp_westwood_register); +module_exit(tcp_westwood_unregister); + +MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Westwood+"); diff -urN ns-2.31-orig/tcp/linux/validation/run-linux.csh ns-2.31-linux/tcp/linux/validation/run-linux.csh --- ns-2.31-orig/tcp/linux/validation/run-linux.csh 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/validation/run-linux.csh 2007-07-18 16:54:56.000000000 -0700 @@ -0,0 +1,76 @@ +nsdir="/home/cesar/ns-allinone-2.31/ns-2.31/" +for flownum in 1 +#2 8 32 128 +do + + for bw in 100 +#1 10 100 1000 + do + #sidebw=`echo "$bw*4" | bc` + sidebw=1000 + for onewaydelay in 64 +#2 8 32 128 + do + #buffer=`echo "$bw*$onewaydelay*25/1448" | bc` + #if [ $buffer -lt 100 ] + #then + # buffer=100 + #fi + # BDP = Mbps * ms *2 = Kbps * s * 2 = Kb *2 = Kb * 2 /8bpB / 
1448Bp pkt = 1000*2/8*1448=250/1448 + # We use 1/10 of BDP + buffer=220 + for endtime in 900 +#20 200 + do + for i in highspeed reno htcp cubic hybla westwood bic vegas scalable +#highspeed reno vegas bic htcp cubic westwood hybla scalable +#htcp westwood cubic highspeed reno vegas + do + dirname=$flownum-$bw-$onewaydelay-$endtime-$i +# rm $dirname -r + mkdir $dirname + cd $dirname + echo "Agent/TCP/Linux" > config + echo $i >> config + echo $flownum >> config + echo $bw"Mb" >> config + echo $onewaydelay"ms" >> config + echo $buffer >> config + echo $sidebw"Mb" >> config + echo $endtime >> config + + sttime=`cat /proc/uptime | awk '{print $1}'` + date > time_report + $nsdir/ns ../test-linux.tcl > txt + edtime=`cat /proc/uptime | awk '{print $1}'` + date >> time_report + echo "$edtime - $sttime" | bc >> time_report + cat result0 | awk 'BEGIN{old=0}{print $1, ($3-old)*1448*8*2}{old=$3}' > rate0 + + echo " + set term png + set size 0.7,0.7 + set output '../"$dirname"_result0.png' + set xlabel 'time (s)' + set ylabel 'Cwnd (segments)' + set title '"$dirname"' + plot [0:120][] 'result0' u 1:2 title 'Flow 1' w lp 1 + " | gnuplot; + + echo " + set term png + set size 0.7,0.7 + set output '../"$dirname"_rate0.png' + set logscale y + set xlabel 'time (s)' + set ylabel 'Goodput (Mbps)' + set title '"$dirname"' + plot [0:120][] 'rate0' u 1:2 title 'Flow 1' w lp 1 + " | gnuplot; + + cd .. + done + done + done + done +done diff -urN ns-2.31-orig/tcp/linux/validation/test-linux.tcl ns-2.31-linux/tcp/linux/validation/test-linux.tcl --- ns-2.31-orig/tcp/linux/validation/test-linux.tcl 1969-12-31 16:00:00.000000000 -0800 +++ ns-2.31-linux/tcp/linux/validation/test-linux.tcl 2007-07-18 16:48:34.000000000 -0700 @@ -0,0 +1,134 @@ +set TCP_Variant "Agent/TCP/Fack1" +set TCP_ACK_Variant "Agent/TCPSink/Sack1/DelAck" +set FlowNumber 1 +set MainBW "1000Mb" +set SideBW "4000Mb" +#should be 4 times of Si