Commit fcc80cee authored by David Johnson

Add an optional arg to TBSetNodeEventState so it can be nonfatal.

Use that in the libosload_virtnode package to retry on failures, instead
of aborting.  This is a cheesy retry strategy to prevent failures on
multi-thousand vnode topologies.

(Default behavior without the arg is still fatal.)
parent a1056149
......@@ -932,24 +932,32 @@ sub TBSetNodeLogEntry($$$$)
#
# Set event state for a node.
#
# usage: TBSetNodeEventState(char *node, char *state)
# usage: TBSetNodeEventState(char *node, char *state; int fatal)
# Returns 1 if okay.
# Returns 0 if failed.
#
sub TBSetNodeEventState($$)
sub TBSetNodeEventState($$;$)
{
my ($node, $state) = @_;
my ($node, $state, $fatal) = @_;
#
# If using the event system, we send out an event for the state daemon to
# pick up. Otherwise, we just set the state in the database ourselves
#
require event;
return event::EventSendFatal(objtype => TBDB_TBEVENT_NODESTATE,
objname => $node,
eventtype => $state,
host => $BOSSNODE);
if (!defined($fatal) || $fatal) {
return event::EventSendFatal(objtype => TBDB_TBEVENT_NODESTATE,
objname => $node,
eventtype => $state,
host => $BOSSNODE);
}
else {
return event::EventSendWarn(objtype => TBDB_TBEVENT_NODESTATE,
objname => $node,
eventtype => $state,
host => $BOSSNODE);
}
}
#
......
......@@ -2807,8 +2807,21 @@ sub SetupReload($$)
}
}
#
# Need to kick virtnodes so stated picks up the next_op_mode from os_select
TBSetNodeEventState($node_id,TBDB_NODESTATE_SHUTDOWN);
#
# We attempt to re-send failed event sends; failures can be a problem here
# in multi-thousand vnode experiments. Definitely a bit of a hack, but
# the sending rate is highest here.
#
my $chances = 8;
while ($chances > 0
&& !TBSetNodeEventState($node_id,TBDB_NODESTATE_SHUTDOWN,0)) {
tbwarn("$self SetupReload: TBSetNodeEventState failed; waiting and".
" retrying.");
sleep(4);
$chances -= 1;
}
return 0;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment