#!/bin/sh
# PCP QA Test No. 1661
# Test pmproxy libpcp_web discovery file descriptor leaks
#
# Copyright (c) 2019 Red Hat.  All Rights Reserved.
#

# seq is the base name of this script; it names the .full log as well
seq=$(basename $0)
echo "QA output created by $seq"

# get standard environment, filters and checks
. ./common.product
. ./common.filter
. ./common.check

# prerequisites: a Linux platform, a pmproxy binary on the PATH and
# whatever _check_redis_server insists on
if ! [ $PCP_PLATFORM = linux ]
then
    _notrun "Test only runs on Linux"
fi
if ! which pmproxy >/dev/null 2>&1
then
    _notrun "No pmproxy binary installed"
fi
_check_redis_server

# Exit handler: remove the scratch archive directory, append the pmproxy
# and pmlogger logs to $seq.full, put the pmlogger configuration and the
# PCP services back, return pmproxy to the state recorded in
# $pmproxy_was_running, then remove the temporary files.
_cleanup()
{
    echo
    echo === cleaning up
    $sudo rm -rf "$dir"

    echo
    echo === see $seq.full for logs
    # current and previous pmproxy logs, each under its own banner
    for _f in pmproxy.log pmproxy.log.prev
    do
	echo "+++ $_f +++" >>$seq.full
	cat $PCP_LOG_DIR/pmproxy/$_f >>$seq.full
    done
    # log from the short-lived pmlogger started by this test
    echo "+++ pmlogger.log +++" >>$seq.full
    cat $tmp.pmlogger.log >>$seq.full

    _restore_config $PCP_ETC_DIR/pcp/pmlogger
    _service pcp restart 2>&1 \
    | _filter_pcp_stop \
    | _filter_pcp_start
    _wait_for_pmcd
    _wait_for_pmlogger

    # leave pmproxy as we found it
    if ! $pmproxy_was_running
    then
	echo "Stopping pmproxy ..." >>$seq.full
	_service pmproxy stop >>$seq.full 2>&1
    else
	echo "Restart pmproxy ..." >>$seq.full
	_service pmproxy restart >>$seq.full 2>&1
	_wait_for_pmproxy
    fi

    $sudo rm -rf $tmp.*
}

# stdin -> stdout filter that drops pmproxy's "disabled time series,
# requires libuv support (missing)" notice and passes every other line
# through untouched
_filter_pmproxy()
{
    sed -n -e '/pmproxy: disabled time series, requires libuv support (missing)/!p'
}

# Announce a test step.
#
# Writes a blank separator line to stdout only, then the message (all
# arguments joined by single spaces) to stdout and appended to $seq.full.
# The arguments are emitted verbatim: they are no longer subject to
# pathname expansion or re-splitting, so a message containing "*" or "?"
# is logged as written rather than as a list of file names.
_full_log()
{
    echo
    printf '%s\n' "$*" | tee -a "$seq.full"
}

# get current log volume of primary pmlogger, according to pmproxy
#
# The archive base name of the "primary" instance is taken from
# pmcd.pmlogger.archive, then the open files of $pmproxy_pid are searched
# for that name.  The text after the last "." of the first match that
# ends in a digit is printed on stdout.  The archive path and the full
# fd listing are always appended to $seq.full.
#
# Prints nothing when the primary archive path is not (yet) known or when
# pmproxy has no matching file open, so callers can test for an empty
# result and retry.
#
# Uses globals: pmproxy_pid, LOCALHOST, seq, sudo, PCP_AWK_PROG
# Sets global:  path
#
_log_volume()
{
    path=`pminfo -f pmcd.pmlogger.archive | sed -n -e '/"primary"]/{
s/"$//
s/.*"//
p
}'`
    echo "primary pmlogger archive path: $path" >>$seq.full
    echo "contents of /proc/$pmproxy_pid/fd/ ..." >>$seq.full
    if [ -z "$path" ]
    then
	# an empty pattern would match every descriptor and could report
	# the volume of some unrelated archive, so just log and give up
	$sudo ls -l /proc/$pmproxy_pid/fd/ >>$seq.full
	return 0
    fi
    # -F: the path is a literal string, not a regular expression;
    # the hostname is handed to awk as a variable rather than being
    # spliced into the program text
    $sudo ls -l /proc/$pmproxy_pid/fd/ | \
	tee -a $seq.full | \
	grep -F -- "$path" | \
	$PCP_AWK_PROG -F. -v host="$LOCALHOST" '$0 ~ ("pmlogger." host ".*.[0-9]$") {print $NF; exit}'
}

status=1	# failure is the default!
# scratch archive directory used by this test; _cleanup removes it again
dir="$PCP_ARCHIVE_DIR/qa$seq"
# start clean: no temporary files and no $seq.full from an earlier run
$sudo rm -rf $tmp $tmp.* $seq.full
# run _cleanup on normal exit (0) and on signals 1, 2, 3 and 15; the \$
# defers expansion of $status until the trap actually fires
trap "_cleanup; exit \$status" 0 1 2 3 15
# first of a series of checkpoints appended to $seq.full
# NOTE(review): src/time_stamp presumably records elapsed time per step - confirm
src/time_stamp $tmp.stamp begin >>$seq.full

# NOTE(review): presumably read by pmlogger_check to skip pmlogconf
# processing when pmlogger is (re)started - confirm
export PMLOGGER_CHECK_SKIP_LOGCONF=yes

# first remove all the old, stale, archives (the ones that might have
# accumulated for hosts other than this one during previous QA runs)
#
# Files older than 30 days are candidates; anything below a directory
# named after this host is filtered out and kept.  IFS= and read -r keep
# each file name intact (no backslash processing, no trimming of leading
# or trailing white space) so exactly the file that find reported is the
# one that gets removed.
#
find $PCP_ARCHIVE_DIR -type f -mtime +30 \
| sed -e '/\/'"`hostname`"'\//d' \
| while IFS= read -r f
do
    $sudo rm -f "$f"
done
src/time_stamp $tmp.stamp 'cleanup old archives' >>$seq.full

# note whether pmproxy had a pid file before the test touched it, so
# that _cleanup knows whether to restart or to stop it afterwards
if [ -f $PCP_RUN_DIR/pmproxy.pid ]
then
    pmproxy_was_running=true
else
    pmproxy_was_running=false
fi
echo "pmproxy_was_running=$pmproxy_was_running" >>$seq.full

# real QA test starts here
LOCALHOST=$(hostname)

# only want the primary logger running
_save_config $PCP_ETC_DIR/pcp/pmlogger
_restore_pmlogger_control
_service pmlogger stop 2>&1 | _filter_pcp_stop

_full_log === restarting pmproxy service to ensure sane starting condition
_service pmproxy restart 2>&1 \
| _filter_pcp_stop \
| _filter_pcp_start \
| _filter_pmproxy
_wait_for_pmproxy

# every later /proc check is made against this pid
pmproxy_pid=$(_get_pids_by_name -a pmproxy)
if [ -z "$pmproxy_pid" ]
then
    echo === pmproxy not running
    exit
fi
# snapshot of when, and as which process, pmproxy came up
{
    date
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]ID|[p]mproxy'
    echo "pmproxy_pid=$pmproxy_pid"
} >>$seq.full

#debug# echo "sudo gdb -p $pmproxy_pid"
#debug# read x
src/time_stamp $tmp.stamp 'restart pmproxy' >>$seq.full

_full_log === creating archive in new archive directory 
# create $dir via mkdir_and_chown from rc-proc.sh, which is sourced inside
# the root shell so the function is defined there
$sudo sh -c ". $PCP_SHARE_DIR/lib/rc-proc.sh; mkdir_and_chown $dir 775 $PCP_USER:$PCP_GROUP"
# make sure no archive files left here from a previous run of this test
$sudo rm -f $dir/testarchive.*
# record a short archive named testarchive in $dir; pmlogger's own log goes
# to $tmp.pmlogger.log, which _cleanup later appends to $seq.full
# NOTE(review): -s 10 -t 0.5sec presumably means 10 samples, 0.5 seconds apart - confirm
$sudo sh -c "pmlogger -s 10 -t 0.5sec -c config.default -l$tmp.pmlogger.log $dir/testarchive"
src/time_stamp $tmp.stamp 'pmlogger done' >>$seq.full

_full_log === checking pmproxy file descriptors for deleted files, should be none
# A descriptor that pmproxy still holds on a removed testarchive file is
# listed under /proc/<pid>/fd with a "(deleted)" suffix; any such line is
# printed on stdout.
if [ -d /proc/$pmproxy_pid ]
then
    $sudo ls -l /proc/$pmproxy_pid/fd | grep 'testarchive.*(deleted)'
else
    # pmproxy has gone away: collect what evidence there is in $seq.full
    # and give up (status is still 1, so the test fails)
    echo "Arrgh: pmproxy pid=$pmproxy_pid has vanished" | tee -a $seq.full
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]ID|[p]mproxy' >>$seq.full
    # if the glob matches nothing, $log is the literal pattern, the -f
    # test fails and the "No pmproxy log files?" branch is taken
    for log in $PCP_LOG_DIR/pmproxy/pmproxy.log*
    do
	if [ -f "$log" ]
	then
	    # blank separator goes to the same .full file as the rest of
	    # the diagnostics
	    echo >>$seq.full
	    ls -l "$log" >>$seq.full
	    cat "$log" >>$seq.full
	else
	    echo "No pmproxy log files?" >>$seq.full
	    ls -ld $PCP_LOG_DIR/pmproxy >>$seq.full
	    ls -l $PCP_LOG_DIR/pmproxy >>$seq.full
	fi
    done
    exit
fi
src/time_stamp $tmp.stamp 'fd check' >>$seq.full

_full_log === compressing the new archives, which also deletes the originals
# data volume 0 and the metadata file are compressed in turn; ownership of
# the directory tree is handed to $PCP_USER after each step
for suff in 0 meta
do
    if [ ! -f $dir/testarchive.$suff ]
    then
	echo "Error: $dir/testarchive.$suff missing!"
	ls -l $dir
    else
	$sudo xz $dir/testarchive.$suff
    fi
    $sudo chown -R $PCP_USER $dir
done
src/time_stamp $tmp.stamp 'compression' >>$seq.full

_full_log === restarting pmlogger # primary only
_service pmlogger restart 2>&1 | _filter_pcp_start
_wait_for_pmlogger
src/time_stamp $tmp.stamp 'restart pmcd & pmlogger' >>$seq.full

_full_log === wait for pmproxy to process filesystem events

# unfortunately, it seems that simply finding all the archives
# may take a while ... and there is no externally visible pmproxy
# state we can wait on, so this host-specific heuristic is all I
# can do to stop the test consistently failing on specific hosts
# - kenj
#
# default settling time, doubled for the hosts known to need longer
settle=2
case `hostname`
in
    bozo.localdomain|vm11.localdomain)
	    settle=4
	    ;;
esac
pmsleep $settle
src/time_stamp $tmp.stamp 'wait for pmproxy' >>$seq.full

_full_log === checking pmproxy file descriptors for deleted files, should be none
# bail out early if pmproxy is no longer there to be inspected
if [ ! -d /proc/$pmproxy_pid ]
then
    echo "Arrgh: pmproxy pid=$pmproxy_pid has vanished"
    exit
fi
# any descriptor still held on a removed testarchive file shows up here
$sudo ls -l /proc/$pmproxy_pid/fd | grep 'testarchive.*(deleted)'
src/time_stamp $tmp.stamp 'check for deleted files' >>$seq.full

_full_log === checking pmproxy correctly switches logvol when pmlogger bumps to a new logvol
# poll until pmproxy is seen holding the primary archive open; give up
# with diagnostics on stdout once the retry budget is used up
retry=0
until
    curlogvol=$(_log_volume)
    echo "curlogvol=$curlogvol" >>$seq.full
    [ -n "$curlogvol" ]
do
    if [ $retry -gt 5 ]
    then
	echo "Arrgh ... curlogvol not found from ..."
	$sudo ls -l /proc/$pmproxy_pid/fd/
	echo "pmproxy_pid=$pmproxy_pid"
	$PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]ID|[p]mproxy'
	exit
    fi
    retry=$((retry + 1))
    sleep 2
done
src/time_stamp $tmp.stamp 'found old volume #' >>$seq.full
# pmproxy's own volume-change counter, before and after, for the record
pminfo -f pmproxy.discover.logvol.change_vol >>$seq.full 2>&1
# have pmlogger start a new volume and log one more metric so that
# something lands in it
# NOTE(review): pmlc -P presumably connects to the primary pmlogger - confirm
( echo 'new volume'; echo 'log mandatory on once { sample.long.one }' ) | $sudo pmlc -P >> $seq.full 2>&1
# make sure pmproxy has noticed ...
# there is no apparent way to make this deterministic, see comment above
#
case `hostname`
in
    bozo.localdomain|vm11.localdomain)
	    pmsleep 4
	    ;;
    *)
	    pmsleep 2
	    ;;
esac
pminfo -f pmproxy.discover.logvol.change_vol >>$seq.full 2>&1
newlogvol=`_log_volume`
echo "newlogvol=$newlogvol" >>$seq.full
if [ -z "$newlogvol" ]
then
    echo "Arrgh ... newlogvol not found from ..."
    $sudo ls -l /proc/$pmproxy_pid/fd/
    echo "pmproxy_pid=$pmproxy_pid"
    $PCP_PS_PROG $PCP_PS_ALL_FLAGS | grep -E '[P]ID|[p]mproxy'
    exit
fi
# pmproxy must now be on exactly the next volume.  The test is phrased
# positively so that a comparison that cannot be evaluated (a volume
# "number" that is not numeric makes expr or [ fail) is reported as
# FAILED rather than falling through to "passed".
wantlogvol=`expr $curlogvol + 1`
if [ "$wantlogvol" -eq "$newlogvol" ]
then
    echo passed
else
    echo FAILED curlogvol=$curlogvol newlogvol=$newlogvol
fi
src/time_stamp $tmp.stamp 'found new volume #' >>$seq.full

_full_log === checking pmproxy survives pmlogger restart
_service pmlogger restart 2>&1 | _filter_pcp_start
_wait_for_pmlogger
# the pid captured at the start must still own a /proc entry; the test is
# run via $sudo because the fd directory belongs to the pmproxy process
if $sudo [ ! -d /proc/$pmproxy_pid/fd/ ]
then
    echo FAIL
    exit
fi
echo passed
src/time_stamp $tmp.stamp 'check pmproxy survives' >>$seq.full

# success, all done
status=0
exit
