Be less agressive about marking home servers as zombie.
authorAlan T. DeKok <aland@freeradius.org>
Wed, 4 Aug 2010 12:17:37 +0000 (14:17 +0200)
committerAlan T. DeKok <aland@freeradius.org>
Wed, 4 Aug 2010 13:30:50 +0000 (15:30 +0200)
Instead of marking them as zombie as soon as ONE packet doesn't
receive a response, mark then as zombie when we have received NO
responses for (zombie_period / 4)

src/include/realms.h
src/main/event.c
src/main/realms.c

index 506d087..9a590e9 100644 (file)
@@ -60,6 +60,7 @@ typedef struct home_server {
        int             currently_outstanding;
        int             message_authenticator;
 
+       time_t          last_packet;
        struct timeval  revive_time;
        struct timeval  zombie_period_start;
        int             zombie_period; /* unresponsive for T, mark it dead */
index 456a218..f2323b5 100644 (file)
@@ -1141,33 +1141,53 @@ static void no_response_to_proxied_request(void *ctx)
        }
 #endif
 
-       if (home->state == HOME_STATE_IS_DEAD) {
-               rad_assert(home->ev != NULL); /* or it will never wake up */
+       /*
+        *      If it's not alive, don't try to make it a zombie.
+        */
+       if (home->state != HOME_STATE_ALIVE) {
+               /*
+                *      Don't check home->ev due to race conditions.
+                */
                return;
        }
 
        /*
-        *      Enable the zombie period when we notice that the home
-        *      server hasn't responded.  We do NOT back-date the start
-        *      of the zombie period.
+        *      We've received a real packet recently.  Don't mark the
+        *      server as zombie until we've received NO packets for a
+        *      while.  The "1/4" of zombie period was chosen rather
+        *      arbitrarily.  It's a balance between too short, which
+        *      gives quick fail-over and fail-back, or too long,
+        *      where the proxy still sends packets to an unresponsive
+        *      home server.
         */
-       if (home->state == HOME_STATE_ALIVE) {
-               home->state = HOME_STATE_ZOMBIE;
-               home->zombie_period_start = now;        
-               fr_event_delete(el, &home->ev);
-               home->currently_outstanding = 0;
-               home->num_received_pings = 0;
-
-               radlog(L_PROXY, "Marking home server %s port %d as zombie (it looks like it is dead).",
-                      inet_ntop(home->ipaddr.af, &home->ipaddr.ipaddr,
-                                buffer, sizeof(buffer)),
-                      home->port);
-
-               /*
-                *      Start pinging the home server.
-                */
-               ping_home_server(home);
+       if ((home->last_packet + ((home->zombie_period + 3) / 4)) >= now.tv_sec) {
+               return;
        }
+
+       /*
+        *      Enable the zombie period when we notice that the home
+        *      server hasn't responded for a while.  We back-date the
+        *      zombie period to when we last received a response from
+        *      the home server.
+        */
+       home->state = HOME_STATE_ZOMBIE;
+       
+       home->zombie_period_start.tv_sec = home->last_packet;
+       home->zombie_period_start.tv_sec = USEC / 2;
+       
+       fr_event_delete(el, &home->ev);
+       home->currently_outstanding = 0;
+       home->num_received_pings = 0;
+       
+       radlog(L_PROXY, "Marking home server %s port %d as zombie (it looks like it is dead).",
+              inet_ntop(home->ipaddr.af, &home->ipaddr.ipaddr,
+                        buffer, sizeof(buffer)),
+              home->port);
+       
+       /*
+        *      Start pinging the home server.
+        */
+       ping_home_server(home);
 }
 #endif
 
@@ -3198,8 +3218,11 @@ REQUEST *received_proxy_response(RADIUS_PACKET *packet)
         *      receive a packet?  Setting this here means that we
         *      mark it alive on *any* packet, even if it's lost all
         *      of the *other* packets in the last 10s.
+        *
+        *      This behavior could be configurable.
         */
        request->home_server->state = HOME_STATE_ALIVE;
+       request->home_server->last_packet = now.tv_sec;
        
 #ifdef WITH_COA
        /*
index e1f8e6d..97f8acb 100644 (file)
@@ -417,6 +417,13 @@ static int home_server_add(realm_config_t *rc, CONF_SECTION *cs, int pool_type)
        home->name = name2;
        home->cs = cs;
 
+        /*
+        *      For zombie period calculations.  We want to count
+        *      zombies from the time when the server starts, instead
+        *      of from 1970.
+        */
+       home->last_packet = time(NULL);
+
        /*
         *      Authentication servers have a default "no_response_fail = 0".
         *      Accounting servers have a default "no_response_fail = 1".