commit 0a10aaba1bb4265fc2ce7d98b19938de6279fad6 Author: Martin Magr Date: Fri Oct 9 17:09:26 2020 +0200 Revert "Adapt container health check for built-in podman health checks" This reverts commit 31a1f9c8ed4ad8a212e146cc59e5273048570b32. In train health checks are still scheduled and executed by systemd. So there is no need for adaptation to podman managed health checks. Change-Id: I1e43a1ee5a72afabb0f3ba650c9dd40d0a29d6ac diff --git a/container_config_scripts/monitoring/collectd_check_health.py b/container_config_scripts/monitoring/collectd_check_health.py deleted file mode 100755 index eea75ec..0000000 --- a/container_config_scripts/monitoring/collectd_check_health.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright 2018 Red Hat Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import datetime -import re -import sys - -HCLOG = '/var/log/collectd/healthchecks.stdout' -START_RE = re.compile( - r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P[\w\-\.\:]*) systemd\[.*\]: Started /usr/bin/podman healthcheck run (?P\w*)') -EXEC_RE = re.compile( - r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P[\w\-\.\:]*) podman\[(?P\d*)\]: (?P.*) container exec (?P\w*) \(.*name=(?P\w*).*\)') -RESULT_RE = re.compile( - r'(?P\w{3} \d{2} \d{2}\:\d{2}\:\d{2}) (?P[\w\-\.\:]*) podman\[(?P\d*)\]: (?P(un)?healthy)') - - -def process_healthcheck_output(path_to_log): - """Process saved output of health checks and returns list of unhealthy - containers. - """ - data = {} - pid_map = {} - with open(path_to_log, "r+") as logfile: - for line in logfile: - match = START_RE.search(line) - if match: - item = data.setdefault(match.group('container_id'), {}) - item['timestamp_start'] = match.group('timestamp') - item['host'] = match.group('host') - continue - match = EXEC_RE.search(line) - if match: - item = data.setdefault(match.group('container_id'), {}) - item['container_name'] = match.group('container_name') - item['host'] = match.group('host') - item['pid'] = match.group('pid') - pid_map[match.group('pid')] = match.group('container_id') - continue - match = RESULT_RE.search(line) - if match: - if match.group('pid') not in pid_map: - continue - item = data[pid_map[match.group('pid')]] - item['result'] = match.group('result') - if 'timestamp_start' not in item: - continue - try: - start = datetime.datetime.strptime(item['timestamp_start'], - '%b %d %H:%M:%S') - end = datetime.datetime.strptime(match.group('timestamp'), - '%b %d %H:%M:%S') - item['duration'] = (end - start).seconds - except Exception as ex: - err = "[WARN] Failure during calculating duration: {}" - print(err.format(ex)) - continue - logfile.truncate() - - # truncate the file - with open(HCLOG, "w") as logfile: - pass - - unhealthy = [] - for container in data.values(): - if 'result' not in container: - continue - if container['result'] == 'healthy': - continue - log = ('{container_name}: Container health check on host {host} ' - 'results as {result} after {duration}s.') - unhealthy.append(log.format(**container)) - return unhealthy - - -if __name__ == "__main__": - unhealthy = process_healthcheck_output(HCLOG) - if unhealthy: - print(' ; '.join(unhealthy)) - sys.exit(2) diff --git a/deployment/metrics/collectd-container-puppet.yaml b/deployment/metrics/collectd-container-puppet.yaml index 7c11f7b..73099a1 100644 --- a/deployment/metrics/collectd-container-puppet.yaml +++ b/deployment/metrics/collectd-container-puppet.yaml @@ -330,7 +330,18 @@ parameters: default: true CollectdContainerHealthCheckCommand: type: string - default: "/scripts/collectd_check_health.py" + default: | + output="" + while read line ; do + i=$(echo $line | awk '//{gsub(/:/, "", $0); print $5}') + log=$(echo $line | awk '{split($0,a,/:\s+Error:\s+/); print a[2]}') + log=${log:0:-1} + output+=" ; ${i}: ${log}" + done < <(egrep "^[a-zA-Z]{3}\s+[0-9]{2}\s+[0-9\:]{8}\s+.*\s+.*:\s+[Ee]rror\:" /var/log/collectd/healthchecks.log) + truncate -s0 /var/log/collectd/healthchecks.log + if [ ! -z "${output}" ]; then + echo ${output:3} && exit 2; + fi CollectdContainerHealthCheckInterval: type: number description: The frequency in seconds the docker health check is executed. @@ -629,12 +640,6 @@ outputs: - path: /var/log/collectd owner: collectd:collectd recurse: true - container_config_scripts: - map_merge: - - {get_attr: [ContainersCommon, container_config_scripts]} - - collectd_check_health.py: - mode: "0755" - content: { get_file: ../../container_config_scripts/monitoring/collectd_check_health.py } docker_config: step_5: collectd: @@ -656,7 +661,6 @@ outputs: - /var/lib/config-data/puppet-generated/collectd:/var/lib/kolla/config_files/src:ro - /var/log/containers/collectd:/var/log/collectd:rw,z - /var/run/:/var/run:rw - - /var/lib/container-config-scripts:/scripts:ro - /sys/fs/cgroup:/sys/fs/cgroup:ro environment: KOLLA_CONFIG_STRATEGY: COPY_ALWAYS @@ -684,7 +688,7 @@ outputs: copy: dest: /etc/rsyslog.d/openstack-healthcheck.conf content: | - if ($programname startswith 'podman' and ($msg contains 'container exec' or $msg contains 'healthy')) or ($programname startswith 'systemd' and $msg contains 'podman healthcheck run') then -/var/log/containers/collectd/healthchecks.stdout + if $programname startswith 'healthcheck_' then -/var/log/containers/collectd/healthchecks.log & stop - name: Remove healthcheck log when: