commit resource-agents for openSUSE:Factory
Hello community, here is the log from the commit of package resource-agents for openSUSE:Factory checked in at 2015-04-30 11:49:47 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/resource-agents (Old) and /work/SRC/openSUSE:Factory/.resource-agents.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Package is "resource-agents" Changes: -------- --- /work/SRC/openSUSE:Factory/resource-agents/resource-agents.changes 2015-04-10 09:47:41.000000000 +0200 +++ /work/SRC/openSUSE:Factory/.resource-agents.new/resource-agents.changes 2015-04-30 11:49:48.000000000 +0200 @@ -1,0 +2,23 @@ +Tue Apr 21 06:11:26 UTC 2015 - kgronlund@suse.com + +- Update to version 3.9.6+git.1429568091.f79322c: + + Medium: multiple: make sure that the pidfile directory exist + + orainstance.sh: Fix process name grep in exit_idle + + Low: pgsql: fix check_wal_receiver to prevent incorrect "ERROR" status display and output WARN log in the master + + Low: allow rgmanager lvm agent to handle hostname aliases + + Fix rmq_join_list to only return online nodes + + Medium: pgsql: Support replication slots + + Low: pgsql: add validation check for replication slot. + + Low: redis: smarter config argument default + + Fix: redis: reliable shutdown. + + Low: redis: loosen advertised default monitor timeout seconds + + Fix: redis: do not attempt to demote if redis is dead + + Low: redis: make sure to always delete master score on stop + + Low: redis: reconnect to new master after promotion + + High: redis: only connect to active master instances + + High: redis: wait_last_known_master option for redis agent. + + Low: redis: prevent bash syntax errors and lower priority some log messages + + High: galera: retrieve last sequence number without using read-only mode + + Fix return code in asterisk_monitor #2 + +------------------------------------------------------------------- Old: ---- resource-agents-3.9.6+git.1427133197.6897c9c.tar.xz New: ---- resource-agents-3.9.6+git.1429568091.f79322c.tar.xz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ resource-agents.spec ++++++ --- /var/tmp/diff_new_pack.wdnVcs/_old 2015-04-30 11:49:49.000000000 +0200 +++ /var/tmp/diff_new_pack.wdnVcs/_new 2015-04-30 11:49:49.000000000 +0200 @@ -48,7 +48,7 @@ Summary: Open Source HA Reusable Cluster Resource Scripts License: GPL-2.0 and LGPL-2.1+ and GPL-3.0+ Group: Productivity/Clustering/HA -Version: 3.9.6+git.1427133197.6897c9c +Version: 3.9.6+git.1429568091.f79322c Release: 0 Url: http://linux-ha.org/ Source: resource-agents-%{version}.tar.xz ++++++ _servicedata ++++++ --- /var/tmp/diff_new_pack.wdnVcs/_old 2015-04-30 11:49:49.000000000 +0200 +++ /var/tmp/diff_new_pack.wdnVcs/_new 2015-04-30 11:49:49.000000000 +0200 @@ -1,4 +1,4 @@ <servicedata> <service name="tar_scm"> <param name="url">git://github.com/ClusterLabs/resource-agents.git</param> - <param name="changesrevision">6897c9c43948a71b865b8f9ca42103f180208f9d</param></service></servicedata> \ No newline at end of file + <param name="changesrevision">f79322cf2ce0094c266515e9242dbdbde3a7db72</param></service></servicedata> \ No newline at end of file ++++++ resource-agents-3.9.6+git.1427133197.6897c9c.tar.xz -> resource-agents-3.9.6+git.1429568091.f79322c.tar.xz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/apache new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/apache --- old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/apache 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/apache 2015-04-21 08:11:26.000000000 +0200 @@ -593,6 +593,7 @@ ocf_exit_reason "Configuration file $CONFIGFILE not found!" return $OCF_ERR_INSTALLED fi + ocf_mkstatedir root 755 `dirname $PidFile` || return $OCF_ERR_INSTALLED return $OCF_SUCCESS } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/asterisk new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/asterisk --- old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/asterisk 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/asterisk 2015-04-21 08:11:26.000000000 +0200 @@ -286,8 +286,13 @@ rc=$? if [ $rc -ne 0 ]; then - ocf_log err "Failed to connect to the Asterisk PBX" - return $OCF_ERR_GENERIC + if [ "$__OCF_ACTION" = "start" ]; then + ocf_log info "Asterisk PBX not running yet" + return $OCF_NOT_RUNNING; + else + ocf_log err "Failed to connect to the Asterisk PBX" + return $OCF_ERR_GENERIC; + fi fi # Optionally check the monitor URI with sipsak diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/galera new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/galera --- old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/galera 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/galera 2015-04-21 08:11:26.000000000 +0200 @@ -443,19 +443,18 @@ ocf_exit_reason "Failure, Attempted to promote Master instance of $OCF_RESOURCE_INSTANCE before bootstrap node has been detected." return $OCF_ERR_GENERIC fi - fi - # make sure the read only instance is stopped - mysql_common_stop - rc=$? - if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then - ocf_exit_reason "Failed to stop read-only galera instance during promotion to Master" - return $rc + galera_monitor + if [ $? -eq $OCF_RUNNING_MASTER ]; then + if ocf_is_true $bootstrap; then + promote_everyone + clear_bootstrap_node + ocf_log info "boostrap node already up, promoting the rest of the galera instances." + fi + return $OCF_SUCCESS fi - sleep 4 - mysql_common_prepare_dirs mysql_common_start "$extra_opts" rc=$? @@ -510,14 +509,14 @@ # if this node was previously a bootstrap node, that is no longer the case. clear_bootstrap_node + clear_last_commit - # start again in slave mode so the new last commit is recorded + # record last commit by "starting" galera. start is just detection of the last sequence number galera_start } galera_start() { - local extra_opts='--read-only=true' local last_commit echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME @@ -526,22 +525,39 @@ return $OCF_ERR_CONFIGURED fi - mysql_common_prepare_dirs - mysql_common_start "$extra_opts" - - is_readonly - if [ $? -ne 0 ]; then - ocf_exit_reason "Slave instance did not start correctly in read-only mode, Make sure local galera.cnf does not have wsrep_cluster_address set." + galera_monitor + if [ $? -eq $OCF_RUNNING_MASTER ]; then + ocf_exit_reason "master galera instance started outside of the cluster's control" return $OCF_ERR_GENERIC fi - ocf_log info "attempting to detect last commit version" - while [ -z "$last_commit" ]; do - last_commit=$(get_status_variable "wsrep_last_committed") - if [ -z "$last_commit" ]; then - sleep 1 + mysql_common_prepare_dirs + + ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat" + last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')" + if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then + ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" + local tmp=$(mktemp) + ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \ + --pid-file=$OCF_RESKEY_pid \ + --socket=$OCF_RESKEY_socket \ + --datadir=$OCF_RESKEY_datadir \ + --user=$OCF_RESKEY_user \ + --wsrep-recover > $tmp 2>&1 + + last_commit="$(cat $tmp | sed -n 's/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p')" + rm -f $tmp + + if [ "$last_commit" = "-1" ]; then + last_commit="0" fi - done + fi + + if [ -z "$last_commit" ]; then + ocf_exit_reason "Unable to detect last known write sequence number" + clear_last_commit + return $OCF_ERR_GENERIC + fi ocf_log info "Last commit version found: $last_commit" set_last_commit $last_commit @@ -567,28 +583,40 @@ if ocf_is_probe; then status_loglevel="info" fi - + mysql_common_status $status_loglevel rc=$? - # If status returned an error, return that immediately - if [ $rc -ne $OCF_SUCCESS ]; then + if [ $rc -eq $OCF_NOT_RUNNING ]; then + last_commit=$(get_last_commit $node) + if [ -n "$last_commit" ]; then + # if last commit is set, this instance is considered started in slave mode + rc=$OCF_SUCCESS + master_exists + if [ $? -ne 0 ]; then + detect_first_master + else + # a master instance exists and is healthy, promote this + # local read only instance + # so it can join the master galera cluster. + set_master_score + fi + fi + return $rc + elif [ $rc -ne $OCF_SUCCESS ]; then return $rc fi + # if we make it here, mysql is running. Check cluster status now. + echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME if [ $? -ne 0 ]; then ocf_exit_reason "local node <${NODENAME}> is started, but is not a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>" return $OCF_ERR_GENERIC fi - is_readonly - if [ $? -ne 0 ]; then - is_primary - if [ $? -ne 0 ]; then - ocf_exit_reason "local node <${NODENAME}> is neither in primary mode nor in read_only mode. Unknown state." - return $OCF_ERR_GENERIC - fi + is_primary + if [ $? -eq 0 ]; then if ocf_is_probe; then # restore master score during probe @@ -596,18 +624,10 @@ set_master_score fi rc=$OCF_RUNNING_MASTER - else - master_exists - if [ $? -ne 0 ]; then - detect_first_master - else - # a master instance exists and is healthy, promote this - # local read only instance - # so it can join the master galera cluster. - set_master_score - fi + else + ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." + rc=$OCF_ERR_GENERIC fi - # TODO look at what is done in the wait script return $rc } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/named new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/named --- old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/named 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/named 2015-04-21 08:11:26.000000000 +0200 @@ -229,6 +229,9 @@ return $OCF_ERR_CONFIGURED fi + # make sure that the pidfile directory exists + ocf_mkstatedir $OCF_RESKEY_named_user 755 `dirname $OCF_RESKEY_named_pidfile` || return $OCF_ERR_INSTALLED + return $OCF_SUCCESS } @@ -487,3 +490,5 @@ *) exit $OCF_ERR_UNIMPLEMENTED;; esac + +# vim:ts=4:sw=4:et: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/ocf-shellfuncs.in new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/ocf-shellfuncs.in --- old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/ocf-shellfuncs.in 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/ocf-shellfuncs.in 2015-04-21 08:11:26.000000000 +0200 @@ -748,6 +748,87 @@ } # +# create a given status directory +# if the directory path doesn't start with $HA_VARRUN, then +# we return with error (most of the calls would be with the user +# supplied configuration, hence we need to do necessary +# protection) +# used mostly for PID files +# +# usage: ocf_mkstatedir owner permissions path +# +# owner: user.group +# permissions: permissions +# path: directory path +# +# example: +# ocf_mkstatedir named 755 `dirname $pidfile` +# +ocf_mkstatedir() +{ + local owner + local perms + local path + + owner=$1 + perms=$2 + path=$3 + + test -d $path && return 0 + [ $(id -u) = 0 ] || return 1 + + case $path in + $HA_VARRUN/*) : this path is ok ;; + *) ocf_log err "cannot create $path (does not start with $HA_VARRUN)" + return 1 + ;; + esac + + mkdir -p $path && + chown $owner $path && + chmod $perms $path +} + +# +# create a unique status directory in $HA_VARRUN +# used mostly for PID files +# the directory is by default set to +# $HA_VARRUN/$OCF_RESOURCE_INSTANCE +# the directory name is printed to stdout +# +# usage: ocf_unique_rundir owner permissions name +# +# owner: user.group (default: "root") +# permissions: permissions (default: "755") +# name: some unique string (default: "$OCF_RESOURCE_INSTANCE") +# +# to use the default either don't set the parameter or set it to +# empty string ("") +# example: +# +# STATEDIR=`ocf_unique_rundir named "" myownstatedir` +# +ocf_unique_rundir() +{ + local path + local owner + local perms + local name + + owner=${1:-"root"} + perms=${2:-"755"} + name=${3:-"$OCF_RESOURCE_INSTANCE"} + path=$HA_VARRUN/$name + if [ ! -d $path ]; then + [ $(id -u) = 0 ] || return 1 + mkdir -p $path && + chown $owner $path && + chmod $perms $path || return 1 + fi + echo $path +} + +# # RA tracing may be turned on by setting OCF_TRACE_RA # the trace output will be saved to OCF_TRACE_FILE, if set, or # by default to diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/pgsql new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/pgsql --- old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/pgsql 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/pgsql 2015-04-21 08:11:26.000000000 +0200 @@ -64,6 +64,7 @@ OCF_RESKEY_xlog_check_count_default="3" OCF_RESKEY_crm_attr_timeout_default="5" OCF_RESKEY_stop_escalate_in_slave_default=30 +OCF_RESKEY_replication_slot_name_default="" : ${OCF_RESKEY_pgctl=${OCF_RESKEY_pgctl_default}} : ${OCF_RESKEY_psql=${OCF_RESKEY_psql_default}} @@ -96,6 +97,7 @@ : ${OCF_RESKEY_xlog_check_count=${OCF_RESKEY_xlog_check_count_default}} : ${OCF_RESKEY_crm_attr_timeout=${OCF_RESKEY_crm_attr_timeout_default}} : ${OCF_RESKEY_stop_escalate_in_slave=${OCF_RESKEY_stop_escalate_in_slave_default}} +: ${OCF_RESKEY_replication_slot_name=${OCF_RESKEY_replication_slot_name_default}} usage() { cat <<EOF @@ -361,6 +363,25 @@ <content type="boolean" default="${OCF_RESKEY_restart_on_promote_default}" /> </parameter> +<parameter name="replication_slot_name" unique="0" required="0"> +<longdesc lang="en"> +Set this option when using replication slots. +When the master node has 1 slave node,one replication slot would be created with the name "replication_slot_name". +When the master node has 2 or more slave nodes,the replication slots would be created for each node, with the name adding the node name as postfix. +For example, replication_slot_name is "sample" and 2 slaves which are "node_a" and "node_b" connect to +their slots, the slots names are "sample_node_a" and "sample_node_b". + +pgsql RA doesn't monitor and delete the repliation slot. +When the slave node has been disconnected in failure or the like, execute one of the following manually. +Otherwise it may eventually cause a disk full because the master node will continue to accumulate the unsent WAL. +1. recover and reconnect the slave node to the master node as soon as possible. +2. delete the slot on the master node by following psql command. +$ select pg_drop_replication_slot('replication_slot_name'); +</longdesc> +<shortdesc lang="en">replication_slot_name</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_slot_name_default}" /> +</parameter> + <parameter name="tmpdir" unique="0" required="0"> <longdesc lang="en"> Path to temporary directory. @@ -404,7 +425,10 @@ If this is true, RA checks wal_receiver process on monitor and notifies its status using "(resource name)-receiver-status" attribute. It's useful for checking whether PostgreSQL (hot standby) connects to primary. -The attribute shows status as "normal" or "ERROR". +The attribute shows status as "normal" or "normal (master)" or "ERROR". +Note that if you configure PostgreSQL as master/slave resource, then +wal receiver is not running in the master and the attribute shows status as +"normal (master)" consistently because it is normal status. </longdesc> <shortdesc lang="en">check_wal_receiver</shortdesc> <content type="boolean" default="${OCF_RESKEY_check_wal_receiver_default}" /> @@ -562,6 +586,17 @@ ocf_log debug "PostgreSQL still hasn't started yet. Waiting..." done + # create replication slot on the master and slave nodes. + # creating slot on the slave node is in preparation for failover. + if use_replication_slot; then + create_replication_slot + rc=$? + if [ $rc -eq $OCF_ERR_GENERIC ]; then + ocf_exit_reason ocf_exit_reason "PostgreSQL can't create replication_slot." + return $OCF_ERR_GENERIC + fi + fi + ocf_log info "PostgreSQL is started." return $rc } @@ -820,13 +855,21 @@ pgsql_wal_receiver_status() { local PID local receiver_parent_pids + local pgsql_real_monitor_status=$1 PID=`head -n 1 $PIDFILE` receiver_parent_pids=`ps -ef | tr -s " " | grep "[w]al receiver process" | cut -d " " -f 3` + if echo "$receiver_parent_pids" | grep -q -w "$PID" ; then attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "normal" -q return 0 fi + + if [ $pgsql_real_monitor_status -eq "$OCF_RUNNING_MASTER" ]; then + attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "normal (master)" -q + return 0 + fi + attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "ERROR" -q ocf_log warn "wal receiver process is not running" return 1 @@ -850,10 +893,6 @@ return $OCF_NOT_RUNNING fi - if ocf_is_true ${OCF_RESKEY_check_wal_receiver}; then - pgsql_wal_receiver_status - fi - if is_replication; then #Check replication state output=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ @@ -943,6 +982,11 @@ pgsql_real_monitor rc=$? + + if ocf_is_true ${OCF_RESKEY_check_wal_receiver}; then + pgsql_wal_receiver_status $rc + fi + if ! is_replication; then return $rc else @@ -1226,6 +1270,82 @@ return 1 } +use_replication_slot() { + if [ -n "$OCF_RESKEY_replication_slot_name" ]; then + return 0 + fi + + return 1 +} + +create_replication_slot_name() { + local number_of_nodes=0 + local target + local replication_slot_name + local replication_slot_name_list_tmp + local replication_slot_name_list + + if [ -n "$NODE_LIST" ]; then + number_of_nodes=`echo $NODE_LIST | wc -w` + fi + + # If the number of nodes 2 or less, Master node has 1 or less Slave node. + # The Master node should have 1 slot for the Slave, which is named "$OCF_RES_KEY_replication_slot_name". + if [ $number_of_nodes -le 2 ]; then + replication_slot_name_list="$OCF_RESKEY_replication_slot_name" + + # If the number of nodes 3 or more, the Master has some Slave nodes. + # The Master node should have some slots equal to the number of Slaves, and + # the Slave nodes connect to their dedicated slot on the Master. + # To ensuring that the slots name are each unique, add postfix to $OCF_RESKEY_replication_slot. + # The postfix is "_$target". + else + for target in $NODE_LIST + do + if [ "$target" != "$NODENAME" ]; then + replication_slot_name="$OCF_RESKEY_replication_slot_name"_"$target" + replication_slot_name_list_tmp="$replication_slot_name_list" + replication_slot_name_list="$replication_slot_name_list_tmp $replication_slot_name" + fi + done + fi + + echo $replication_slot_name_list +} + +create_replication_slot() { + local replication_slot_name + local replication_slot_name_list + local output + local rc + local CREATE_REPLICATION_SLOT_sql + + replication_slot_name_list=`create_replication_slot_name` + ocf_log debug "replication slot names are $replication_slot_name_list." + + for replication_slot_name in $replication_slot_name_list + do + # create replication slot when the same name slot is not exists. + # If the same name slot is already exists, don't create new slot and reuse the old slot. + CREATE_REPLICATION_SLOT_sql="SELECT pg_create_physical_replication_slot('$replication_slot_name') \ + FROM (VALUES (1)) AS t \ + WHERE NOT EXISTS (SELECT * FROM pg_replication_slots WHERE slot_name = '$replication_slot_name');" + + output=`su $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ + $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ + -Atc \"$CREATE_REPLICATION_SLOT_sql\""` + rc=$? + if [ $rc -eq 0 ]; then + ocf_log info "PostgreSQL creates or alredy exist the replication slot($replication_slot_name)" + else + ocf_exit_reason "$output" + return $OCF_ERR_GENERIC + fi + done + + return 0 +} + get_my_location() { local rc local output @@ -1340,6 +1460,8 @@ } user_recovery_conf() { + local number_of_nodes + # put archive_cleanup_command and recovery_end_command only when defined by user if [ -n "$OCF_RESKEY_archive_cleanup_command" ]; then echo "archive_cleanup_command = '${OCF_RESKEY_archive_cleanup_command}'" @@ -1347,6 +1469,15 @@ if [ -n "$OCF_RESKEY_recovery_end_command" ]; then echo "recovery_end_command = '${OCF_RESKEY_recovery_end_command}'" fi + + if use_replication_slot; then + number_of_nodes=`echo $NODE_LIST | wc -w` + if [ $number_of_nodes -le 2 ]; then + echo "primary_slot_name = '${OCF_RESKEY_replication_slot_name}'" + else + echo "primary_slot_name = '${OCF_RESKEY_replication_slot_name}_$NODENAME'" + fi + fi } make_recovery_conf() { @@ -1688,6 +1819,14 @@ return $OCF_ERR_CONFIGURED fi fi + + if use_replication_slot; then + ocf_version_cmp "$version" "9.4" + if [ $? -eq 0 -o $? -eq 3 ]; then + ocf_exit_reason "Replication slot needs PostgreSQL 9.4 or higher." + return $OCF_ERR_CONFIGURED + fi + fi return $OCF_SUCCESS } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/rabbitmq-cluster new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/rabbitmq-cluster --- old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/rabbitmq-cluster 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/rabbitmq-cluster 2015-04-21 08:11:26.000000000 +0200 @@ -104,7 +104,7 @@ rmq_join_list() { - cibadmin -Q 2>/dev/null | grep "$RMQ_CRM_ATTR_COOKIE" | sed -n -e "s/^.*value=.\(.*\)\".*$/\1/p" + cibadmin -Q --xpath "//node_state[@crmd='online']//nvpair[@name='$RMQ_CRM_ATTR_COOKIE']" | grep "$RMQ_CRM_ATTR_COOKIE" | sed -n -e "s/^.*value=.\(.*\)\".*$/\1/p" } rmq_write_nodename() diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/redis new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/redis --- old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/redis 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/redis 2015-04-21 08:11:26.000000000 +0200 @@ -4,13 +4,22 @@ : ${OCF_RESKEY_bin:=/usr/bin/redis-server} : ${OCF_RESKEY_client_bin:=/usr/bin/redis-cli} -: ${OCF_RESKEY_config:=/etc/redis/redis.conf} : ${OCF_RESKEY_user:=redis} : ${OCF_RESKEY_rundir:=/var/run/redis} : ${OCF_RESKEY_pidfile_name:=redis-server.pid} : ${OCF_RESKEY_socket_name:=redis.sock} : ${OCF_RESKEY_port:=6379} +if [ -z "$OCF_RESKEY_config" ]; then + if [ -f "/etc/redis.conf" ]; then + OCF_RESKEY_config="/etc/redis.conf" + else + OCF_RESKEY_config="/etc/redis/redis.conf" + fi +fi + +CHECK_SLAVE_STATE=0 + REDIS_SERVER="$OCF_RESKEY_bin" REDIS_CLIENT="$OCF_RESKEY_client_bin" REDIS_CONFIG="$OCF_RESKEY_config" @@ -102,15 +111,26 @@ <shortdesc lang="en">Replication port</shortdesc> <content type="string" default="${OCF_RESKEY_port}"/> </parameter> + +<parameter name="wait_last_known_master" unique="0" required="0"> +<longdesc lang="en"> +During redis cluster bootstrap, wait for the last known master to be +promoted before allowing any other instances in the cluster to be +promoted. This lessens the risk of data loss when persistent data +is in use. +</longdesc> +<shortdesc lang="en">Wait for last known master</shortdesc> +<content type="boolean" default="false"/> +</parameter> </parameters> <actions> <action name="start" timeout="120" /> <action name="stop" timeout="120" /> <action name="status" timeout="60" /> -<action name="monitor" depth="0" timeout="30" interval="20" /> -<action name="monitor" role="Master" depth="0" timeout="30" interval="20" /> -<action name="monitor" role="Slave" depth="0" timeout="30" interval="60" /> +<action name="monitor" depth="0" timeout="60" interval="45" /> +<action name="monitor" role="Master" depth="0" timeout="60" interval="20" /> +<action name="monitor" role="Slave" depth="0" timeout="60" interval="60" /> <action name="promote" timeout="120" /> <action name="demote" timeout="120" /> <action name="notify" timeout="90" /> @@ -121,21 +141,110 @@ EOI } +INSTANCE_ATTR_NAME=`echo ${OCF_RESOURCE_INSTANCE}| awk -F : '{print $1}'` +CRM_ATTR_REPL_INFO="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_INFO -s redis_replication" +MASTER_HOST="" +MASTER_ACTIVE_CACHED="" +MASTER_ACTIVE="" + +master_is_active() +{ + if [ -z "$MASTER_ACTIVE_CACHED" ]; then + # determine if a master instance is already up and is healthy + crm_mon --as-xml | grep "resource.*id=\"${OCF_RESOURCE_INSTANCE}\".*role=\"Master\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" > /dev/null 2>&1 + MASTER_ACTIVE=$? + MASTER_ACTIVE_CACHED="true" + fi + return $MASTER_ACTIVE +} + +function set_master() +{ + MASTER_HOST="$1" + ${CRM_ATTR_REPL_INFO} -v "$1" -q +} + +function last_known_master() +{ + if [ -z "$MASTER_HOST" ]; then + MASTER_HOST="$(${CRM_ATTR_REPL_INFO} --query -q 2>/dev/null)" + fi + echo "$MASTER_HOST" +} + function crm_master_reboot() { "${HA_SBIN_DIR}/crm_master" -l reboot "$@" } +function calculate_score() +{ + perf_score="$1" + connected_clients="$2" + + if ocf_is_true "$OCF_RESKEY_wait_last_known_master"; then + # only set perferred score by slave_priority if + # we are not waiting for the last known master. Otherwise + # we want the agent to have complete control over the scoring. + perf_score="" + connected_clients="0" + fi + + if [[ -z "$perf_score" ]]; then + if [[ "$(last_known_master)" == "$NODENAME" ]]; then + perf_score=1000 + else + perf_score=1 + fi + fi + perf_score=$(( perf_score + connected_clients )) + echo "$perf_score" +} + +function set_score() +{ + local score="$1" + + if ocf_is_true "$OCF_RESKEY_wait_last_known_master" && ! master_is_active; then + local last_master="$(last_known_master)" + if [ -n "$last_master" ] && [[ "$last_master" != "$NODENAME" ]]; then + ocf_log info "Postponing setting master score for ${NODENAME} until last known master instance [${last_master}] is promoted" + return + fi + fi + + ocf_log debug "monitor: Setting master score to '$score'" + crm_master_reboot -v "$score" +} + function redis_client() { ocf_log debug "redis_client: '$REDIS_CLIENT' -s '$REDIS_SOCKET' $@" "$REDIS_CLIENT" -s "$REDIS_SOCKET" "$@" | sed 's/\r//' } -function monitor() { +function simple_status() { + local pid + + if ! [ -f "$REDIS_PIDFILE" ]; then + return $OCF_NOT_RUNNING + fi + pid="$(<"$REDIS_PIDFILE")" pidof "$REDIS_SERVER" | grep -q "\<$pid\>" || return $OCF_NOT_RUNNING ocf_log debug "monitor: redis-server running under pid $pid" + return $OCF_SUCCESS +} + +function monitor() { + local res + + simple_status + res=$? + if (( res != OCF_SUCCESS )); then + return $res + fi + typeset -A info while read line; do [[ "$line" == "#"* ]] && continue @@ -144,7 +253,6 @@ info[$key]="$value" done < <(redis_client info) if [[ -z "${info[role]}" ]]; then - pidof "$REDIS_SERVER" | grep -q "\<$pid\>" || return $OCF_NOT_RUNNING ocf_log err "monitor: Could not get role from \`$REDIS_CLIENT -s $REDIS_SOCKET info\`" return $OCF_ERR_GENERIC fi @@ -154,32 +262,26 @@ # If score isn't set we the redis setting 'slave_priority'. # If that isn't set, we default to 1000 for a master, and 1 for slave. # We then add 1 for each connected client - score="$(crm_master_reboot --get-value --quiet)" + score="$(crm_master_reboot --get-value --quiet 2>/dev/null)" if [[ -z "$score" ]]; then - score="${info[slave_priority]}" - if [[ -z "$score" ]]; then - if [[ "${info[role]}" == "master" ]]; then - score=1000 - else - score=1 - fi - fi - score=$(( score + info[connected_clients] )) - ocf_log debug "monitor: Setting master score to '$score'" - crm_master_reboot -v "$score" + score=$(calculate_score "${info[slave_priority]}" "${info[connected_clients]}") + set_score "$score" fi if [[ "${info[role]}" == "master" ]]; then + if ocf_is_probe; then + set_master "$NODENAME" + fi return $OCF_RUNNING_MASTER fi - if [[ -n "$CHECK_SLAVE_STATE" ]]; then + if [ "$CHECK_SLAVE_STATE" -eq 1 ]; then if [[ "${info[master_link_status]}" != "up" ]]; then - ocf_log err "monitor: Slave mode link has failed (link=${info[master_link_status]})" + ocf_log info "monitor: Slave mode link has not yet been established (link=${info[master_link_status]})" return $OCF_ERR_GENERIC fi - if [[ "${info[master_host]}" != "${OCF_RESKEY_CRM_meta_notify_master_uname}" ]]; then - ocf_log err "monitor: Slave mode current master does not match running master. current=${info[master_host]}, running=${OCF_RESKEY_CRM_meta_notify_master_uname}" + if [[ "${info[master_host]}" != "$(last_known_master)" ]]; then + ocf_log err "monitor: Slave mode current master does not match running master. current=${info[master_host]}, running=$(last_known_master)" return $OCF_ERR_GENERIC fi fi @@ -246,6 +348,7 @@ if (( status == OCF_NOT_RUNNING )); then ocf_log info "stop: redis is already stopped" + crm_master_reboot -D return $OCF_SUCCESS fi @@ -253,16 +356,12 @@ kill -TERM "$pid" while true; do - monitor + simple_status status=$? if (( status == OCF_NOT_RUNNING )); then crm_master_reboot -D return $OCF_SUCCESS fi - if (( status != OCF_RUNNING_MASTER )) && (( status != OCF_SUCCESS )) && (( status != OCF_ERR_GENERIC )); then # we allow OCF_ERR_GENERIC because monitor can generate an error if we probe redis in the middle of shutdown (the socket won't be responding but the process is up) - ocf_log err "stop: Unknown error while stopping" - return $OCF_ERR_GENERIC - fi sleep 1 done } @@ -273,6 +372,7 @@ if (( status == OCF_RUNNING_MASTER )); then ocf_log info "promote: Already running as master" + set_master "$NODENAME" return $OCF_SUCCESS elif (( status != OCF_SUCCESS )); then ocf_log err "promote: Node is not running as a slave" @@ -284,6 +384,7 @@ monitor status=$? if (( status == OCF_RUNNING_MASTER )); then + set_master "$NODENAME" return $OCF_SUCCESS fi @@ -292,22 +393,33 @@ } function demote() { - CHECK_SLAVE_STATE=1 monitor + local master_host + local master_port + + CHECK_SLAVE_STATE=1 + monitor status=$? if (( status == OCF_SUCCESS )); then ocf_log info "demote: Already running as slave" return $OCF_SUCCESS + elif (( status == OCF_NOT_RUNNING )); then + ocf_log err "demote: Failed to demote, redis not running." + return $OCF_NOT_RUNNING fi - master_host="${OCF_RESKEY_CRM_meta_notify_promote_uname// /}" - : "${master_host:=${OCF_RESKEY_CRM_meta_notify_master_uname// /}}" + master_host="$(last_known_master)" master_port="${REDIS_REPLICATION_PORT}" # The elected master has to remain a slave during startup. # During this period a placeholder master host is assigned. - current_host="$(crm_node -n)" - if [[ "$master_host" == "$current_host" ]]; then + if [ -z "$master_host" ] || [[ "$master_host" == "$NODENAME" ]]; then + CHECK_SLAVE_STATE=0 + master_host="no-such-master" + elif ! master_is_active; then + # no master has been promoted yet. we'll be notified when the + # master starts. + CHECK_SLAVE_STATE=0 master_host="no-such-master" fi @@ -315,12 +427,17 @@ redis_client slaveof "$master_host" "$master_port" - monitor - status=$? + # wait briefly for the slave to connect to the master + for (( c=1; c <= 20; c++ )) + do + monitor + status=$? + if (( status == OCF_SUCCESS )); then + return $OCF_SUCCESS + fi + sleep 1 + done - if (( status == OCF_SUCCESS )); then - return $OCF_SUCCESS - fi ocf_log err "demote: Unexpected error setting slave mode (status=$status)" return $OCF_ERR_GENERIC } @@ -332,6 +449,8 @@ monitor status=$? if (( status == OCF_SUCCESS )); then # were a slave + # calling demote updates the slave's connection + # to the newly appointed Master instance. demote fi ;; @@ -358,6 +477,8 @@ fi } +NODENAME=$(ocf_local_nodename) + ocf_log debug "action=${1:-$__OCF_ACTION} notify_type=${OCF_RESKEY_CRM_meta_notify_type} notify_operation=${OCF_RESKEY_CRM_meta_notify_operation} master_host=${OCF_RESKEY_CRM_meta_notify_master_uname} slave_host=${OCF_RESKEY_CRM_meta_notify_slave_uname} promote_host=${OCF_RESKEY_CRM_meta_notify_promote_uname} demote_host=${OCF_RESKEY_CRM_meta_notify_demote_uname}; params: bin=${OCF_RESKEY_bin} client_bin=${OCF_RESKEY_client_bin} config=${OCF_RESKEY_config} user=${OCF_RESKEY_user} rundir=${OCF_RESKEY_rundir} port=${OCF_RESKEY_port}" case "${1:-$__OCF_ACTION}" in diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/zabbixserver new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/zabbixserver --- old/resource-agents-3.9.6+git.1427133197.6897c9c/heartbeat/zabbixserver 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/heartbeat/zabbixserver 2015-04-21 08:11:26.000000000 +0200 @@ -130,26 +130,6 @@ } # -# Check if PID directory exists -# -check_piddir() { - local piddir - local severity - - # lower severity to info during probe - severity=err - ocf_is_probe && severity=info - - piddir=`dirname ${OCF_RESKEY_pid}` - if [ ! -d $piddir ]; then - ocf_log $severity "PID directory ${piddir} doesn't exist" - return 1 - fi - - return 0 -} - -# # Check for the server configuration file # check_config() { @@ -322,9 +302,8 @@ # validate configuration # zabbixserver_validate_all() { - check_piddir || return $OCF_ERR_INSTALLED check_config $OCF_RESKEY_config || return $OCF_ERR_INSTALLED - + ocf_mkstatedir root 755 `dirname $OCF_RESKEY_pid` || return $OCF_ERR_INSTALLED return $OCF_SUCCESS } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/rgmanager/src/resources/orainstance.sh new/resource-agents-3.9.6+git.1429568091.f79322c/rgmanager/src/resources/orainstance.sh --- old/resource-agents-3.9.6+git.1427133197.6897c9c/rgmanager/src/resources/orainstance.sh 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/rgmanager/src/resources/orainstance.sh 2015-04-21 08:11:26.000000000 +0200 @@ -216,7 +216,7 @@ declare -i n=0 ocf_log debug "Waiting for Oracle processes for $ORACLE_SID to terminate..." - while ps ax | grep $ORACLE_SID | grep -v grep | grep -q -v $LSNR_PROCNAME; do + while ps ax | grep ora_.*_${ORACLE_SID} | grep -v grep | grep -q -v $LSNR_PROCNAME; do if [ $n -ge 90 ]; then ocf_log debug "Timed out while waiting for Oracle processes for $ORACLE_SID to terminate" force_cleanup diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/rgmanager/src/resources/postgres-8.metadata new/resource-agents-3.9.6+git.1429568091.f79322c/rgmanager/src/resources/postgres-8.metadata --- old/resource-agents-3.9.6+git.1427133197.6897c9c/rgmanager/src/resources/postgres-8.metadata 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/rgmanager/src/resources/postgres-8.metadata 2015-04-21 08:11:26.000000000 +0200 @@ -51,18 +51,6 @@ <content type="string" default="-D /var/lib/pgsql/data"/> </parameter> - <parameter name="shutdown_wait"> - <longdesc lang="en"> - Wait X seconds for correct end of service shutdown. - This option is ignored in current release. - </longdesc> - <shortdesc lang="en"> - Wait X seconds for correct end of service shutdown - This option is ignored in current release. - </shortdesc> - <content type="integer" /> - </parameter> - <parameter name="startup_wait"> <longdesc lang="en"> Wait X seconds for correct end of service startup diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/rgmanager/src/resources/utils/fs-lib.sh new/resource-agents-3.9.6+git.1429568091.f79322c/rgmanager/src/resources/utils/fs-lib.sh --- old/resource-agents-3.9.6+git.1427133197.6897c9c/rgmanager/src/resources/utils/fs-lib.sh 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/rgmanager/src/resources/utils/fs-lib.sh 2015-04-21 08:11:26.000000000 +0200 @@ -763,7 +763,7 @@ # Agent-specific force unmount logic, if required -# return = nonzero if successful, or 0 if unsuccessful +# return = 0 if successful, or nonzero if unsuccessful # (unsuccessful = try harder) do_force_unmount() { return 1 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-3.9.6+git.1427133197.6897c9c/rgmanager/src/resources/utils/member_util.sh new/resource-agents-3.9.6+git.1429568091.f79322c/rgmanager/src/resources/utils/member_util.sh --- old/resource-agents-3.9.6+git.1427133197.6897c9c/rgmanager/src/resources/utils/member_util.sh 2015-04-07 15:45:11.000000000 +0200 +++ new/resource-agents-3.9.6+git.1429568091.f79322c/rgmanager/src/resources/utils/member_util.sh 2015-04-21 08:11:26.000000000 +0200 @@ -30,6 +30,9 @@ # is_node_member_clustat() { + local node="$1" + local output_list + # Still having a tag while (a) online but (b) not running pacemaker # (e.g. crm_node) or rgmanager not considered adequate for things like # the LVM agent - so we use corosync-quorumtool instead. The function @@ -51,8 +54,19 @@ # 1 1 rhel7-1.priv.redhat.com # 2 1 rhel7-2.priv.redhat.com # - corosync-quorumtool -l | grep -v "^Nodeid" | grep -i " $1\$" &> /dev/null - return $? + + output_list=$(corosync-quorumtool -l | grep -v "^Nodeid") + + # first try searching for the node in the output as both a FQDN or shortname + echo "$output_list" | grep -i -e " $node\$" -e " $node\..*\$" &> /dev/null && return 0 + + # if the node was not found in the quorum list, try any known aliases found in /etc/hosts + for alias in $(cat /etc/hosts | grep -e "\s$node\s" -e "\s$node\$" | tail -n 1 | sed 's/\t/ /g' | cut -f2- -d " "); + do + echo "$output_list" | grep -i -e " $alias\$" &> /dev/null && return 0 + done + + return 1 }
participants (1)
-
root@hilbert.suse.de