The monitoring database servers in the current hosted model run
periodic scripts for the following:
1) Set date_completed on CURRENT_ALERTS table.
Under some circumstances, it's possible for the notification
system to fail to set date_completed on the current_alerts
table. We periodically run the following:
UPDATE current_alerts
SET date_completed = sysdate,in_progress = '0'
WHERE date_completed is null
AND date_submitted < sysdate - 3/24;
2) Delete old records from the CURRENT_ALERTS table.
This is really just:
DELETE FROM current_alerts
WHERE sysdate - date_completed > NUMBER_OF_DAYS
for some reasonable NUMBER_OF_DAYS. In the Triumph product,
since the CURRENT_ALERTS table is invisible to the user, we
should probably run this daily with NUMBER_OF_DAYS = 1.
3) Execute the synch_probe_state stored procedure.
We run the following every 60 seconds:
--
-- Rebuilds the Monitoring state summaries (shown under the Monitoring tab).
-- Intended to run periodically (every 60 seconds). Steps, in order:
--   1. Reset stale probe_state rows to PENDING.
--   2. Purge probe_state rows for probes that no longer exist.
--   3. Clear and fully rebuild current_state_summaries.
--   4. Recalculate multi-scout thresholds where "all scouts" is configured.
-- NOTE(review): the tracker export hard-wrapped this code mid-token
-- (e.g. "last_check", "check_interval_minutes", "deployed_probe" were split
-- across lines); this is the reconstructed, syntactically valid form.
--
PROCEDURE synch_probe_state IS
BEGIN
    -- Set old data to PENDING status. "Old" means more than three check
    -- intervals since an update, though nothing is reset that is less than
    -- 15 minutes old. Both bounds are expressed as fractions of a day.
    UPDATE probe_state
       SET state = 'PENDING',
           output = 'Awaiting update'
     WHERE last_check < (
               SELECT (sysdate - greatest(
                           15 / 60 / 24,
                           (3 * deployed_probe.check_interval_minutes) / 60 / 24))
                 FROM deployed_probe
                WHERE deployed_probe.recid = probe_state.probe_id
           );

    -- Delete current state for probes that no longer exist.
    DELETE FROM probe_state
     WHERE NOT EXISTS (
               SELECT 1
                 FROM deployed_probe
                WHERE deployed_probe.recid = probe_state.probe_id
           );

    -- Delete existing summaries; they are rebuilt in full below.
    DELETE FROM current_state_summaries;

    -- Update multi_scout_threshold to the number of currently valid
    -- state-reporting scouts if threshold_all is being used.
    UPDATE multi_scout_threshold t
       SET (scout_warning_threshold, scout_critical_threshold) = (
               SELECT
                   decode(scout_warning_threshold_is_all, 0,
                          scout_warning_threshold, count(scout_id)),
                   decode(scout_crit_threshold_is_all, 0,
                          scout_critical_threshold, count(scout_id))
                 FROM probe_state p
                WHERE t.probe_id = p.probe_id
                  AND state IN ('OK', 'WARNING', 'CRITICAL')
                GROUP BY t.probe_id
           );

    -- Calculate probe state counts. The three branches produce disjoint
    -- template_id values, so UNION's implicit dedup is a no-op here; kept
    -- as UNION to preserve the original statement exactly.
    INSERT INTO current_state_summaries (
        customer_id,
        template_id,
        state,
        state_count,
        last_check)
    SELECT /* Satellite hosts and checks */
           p.customer_id,
           decode(p.probe_type, 'host', 'sat_host', 'check', 'sat_probe'),
           nvl(state, 'NONE'),
           count(state),
           max(last_check)
      FROM deployed_probe p, probe_state
     WHERE p.recid = probe_state.probe_id
       AND p.probe_type IN ('host', 'check')
       AND p.os_id = 14
     GROUP BY p.customer_id, p.probe_type, state
    UNION
    SELECT /* Non-satellite hosts and checks, for valid satellites only */
           p.customer_id,
           decode(p.probe_type, 'host', 'host_probe', 'check', 'svc_probe'),
           nvl(state, 'NONE'),
           count(state),
           max(last_check)
      FROM deployed_probe p, probe_state
     WHERE p.recid = probe_state.probe_id
       AND p.sat_cluster_id = probe_state.scout_id
       AND p.probe_type IN ('host', 'check')
       AND p.os_id != 14
     GROUP BY p.customer_id, p.probe_type, state
    UNION
    SELECT /* URLs */
           p.customer_id,
           decode(p.probe_type, 'url', 'll_url'),
           nvl(c.state, 'NONE'),
           count(c.state),
           max(last_check)
      FROM (
           -- Start with a table of all of the states.
           SELECT s.probe_id, state
             FROM deployed_probe p, probe_state s
            WHERE p.recid = s.probe_id
              AND p.probe_type = 'url'
            GROUP BY s.probe_id, state
           UNION
           -- Every URL with at least one ok, warn, or crit should start
           -- with an OK entry.
           SELECT s.probe_id, 'OK' state
             FROM deployed_probe p, probe_state s
            WHERE p.recid = s.probe_id
              AND p.probe_type = 'url'
              AND state IN ('WARNING', 'CRITICAL')
            GROUP BY s.probe_id, state
           MINUS
           -- URLs that have ok, warning or critical should never be
           -- unknown or pending.
           SELECT cs.probe_id, state
             FROM (
                  SELECT DISTINCT probe_id
                    FROM deployed_probe p, probe_state s
                   WHERE p.recid = s.probe_id
                     AND p.probe_type = 'url'
                     AND s.state IN ('OK', 'WARNING', 'CRITICAL')
                  ) cs, (
                  SELECT 'PENDING' state FROM dual
                  UNION ALL
                  SELECT 'UNKNOWN' state FROM dual
                  ) ps
           MINUS
           -- A URL that has fewer criticals than the threshold is
           -- not critical.
           SELECT ms.probe_id, 'CRITICAL'
             FROM (
                  SELECT probe_id, count(scout_id) state_count
                    FROM deployed_probe p, probe_state s
                   WHERE p.recid = s.probe_id
                     AND p.probe_type = 'url'
                     AND s.state = 'CRITICAL'
                   GROUP BY probe_id
                  ) ps, multi_scout_threshold ms
            WHERE ps.probe_id = ms.probe_id
              AND ps.state_count < ms.scout_critical_threshold
           MINUS
           -- A URL that has fewer warnings + criticals than the threshold
           -- is not warning.
           SELECT ms.probe_id, 'WARNING'
             FROM (
                  SELECT probe_id, count(scout_id) state_count
                    FROM deployed_probe p, probe_state s
                   WHERE p.recid = s.probe_id
                     AND p.probe_type = 'url'
                     AND s.state IN ('WARNING', 'CRITICAL')
                   GROUP BY probe_id
                  ) ps, multi_scout_threshold ms
            WHERE ps.probe_id = ms.probe_id
              AND ps.state_count < ms.scout_warning_threshold
           MINUS
           -- False warnings and false criticals have been eliminated.
           -- A URL that is critical is not ok, pending, warning or unknown.
           SELECT cs.probe_id, state
             FROM (
                  SELECT probe_id, count(scout_id) state_count
                    FROM deployed_probe p, probe_state s
                   WHERE p.recid = s.probe_id
                     AND p.probe_type = 'url'
                     AND s.state = 'CRITICAL'
                   GROUP BY probe_id
                  ) cs, (
                  SELECT 'OK' state FROM dual
                  UNION ALL
                  SELECT 'WARNING' state FROM dual
                  UNION ALL
                  SELECT 'PENDING' state FROM dual
                  UNION ALL
                  SELECT 'UNKNOWN' state FROM dual
                  ) ps,
                  multi_scout_threshold ms
            WHERE cs.probe_id = ms.probe_id
              AND cs.state_count >= ms.scout_critical_threshold
           MINUS
           -- A URL that is warning is not ok, pending or unknown.
           SELECT cs.probe_id, state
             FROM (
                  SELECT probe_id, count(scout_id) state_count
                    FROM deployed_probe p, probe_state s
                   WHERE p.recid = s.probe_id
                     AND p.probe_type = 'url'
                     AND s.state IN ('CRITICAL', 'WARNING')
                   GROUP BY probe_id
                  ) cs, (
                  SELECT 'OK' state FROM dual
                  UNION ALL
                  SELECT 'PENDING' state FROM dual
                  UNION ALL
                  SELECT 'UNKNOWN' state FROM dual
                  ) ps,
                  multi_scout_threshold ms
            WHERE cs.probe_id = ms.probe_id
              AND cs.state_count >= ms.scout_warning_threshold
           MINUS
           -- A URL that is unknown is not pending.
           SELECT DISTINCT s.probe_id, 'PENDING' state
             FROM probe_state s, deployed_probe p
            WHERE p.probe_type = 'url'
              AND s.probe_id = p.recid
              AND state = 'UNKNOWN'
           ) c,
           (
           -- Most recent check time per URL probe.
           SELECT probe_id, max(last_check) last_check
             FROM probe_state s, deployed_probe p
            WHERE probe_id = recid
              AND probe_type = 'url'
            GROUP BY probe_id
           ) l,
           deployed_probe p
     WHERE c.probe_id = p.recid
       AND c.probe_id = l.probe_id
     GROUP BY p.customer_id, p.probe_type, c.state;

    -- Get the last satellite check time.
    INSERT INTO current_state_summaries (
        customer_id,
        template_id,
        state,
        state_count,
        last_check)
    SELECT
        nvl(min(deployed_probe.customer_id), 0),
        'satellite',
        'NONE',
        0,
        max(last_check)
      FROM satellite_state, deployed_probe
     WHERE satellite_id = deployed_probe.sat_cluster_id
     GROUP BY deployed_probe.customer_id;
END;

Taskomatic is the right tool for these types of tasks.
as for (1), though, sounds like a bug in other code; will it be fixed by karen's rewrite?
RHN::Task::* modules are taskomatic tasks; ::Test shows a trivial one and how to test
tasks. once written, the tasks need to be added to the appropriate conf files so as to run
on sats.
assigning back to dfaraldo since it's a perl task and I'm not sure who else otherwise would
do it

> as for (1), though, sounds like a bug in other code; will it be
fixed by karen's rewrite?
Actually, we'll be getting rid of the CURRENT_ALERTS table entirely at
some point. It was created so alerts could be failed over in case of
a catastrophic notification server failure, but in the real world,
it's better to let the renotification interval expire than to restart
an old alert. Until then, (1) is a safety measure to keep old alerts
from hanging around in case of such a failure.
> assigning back to dfaraldo since it's a perl task and I'm not sure
who else otherwise would do it
I'm on it like white on rice. (I had a TaskMaster bug earlier, so I'm
familiar with the system. :)

To test, configure probes and push config. If the tasks are running,
you should see state summaries under the Monitoring tab. That is, if
you click the Monitoring tab and see counts of probe states that are
nonzero, the tasks are running correctly.