#!/bin/ruby
# encoding:utf-8

require 'date'
require 'digest'
require 'fileutils'
require 'json'
require 'logger'
require 'optparse'
require 'rubygems'
require 'socket'
require 'tempfile'
require 'yaml'
require 'open3'

# Ruby 3.2 removed the long-deprecated File.exists?. Restore it as an alias of
# File.exist? so the rest of this script keeps working on newer interpreters.
class << File
  # Inside `class << File`, self is File's singleton class, so method_defined?
  # looks at File's class-level methods. (singleton_methods would inspect the
  # singleton class of the singleton class and never find exists?.)
  alias exists? exist? unless method_defined?(:exists?)
end

# Fans a single stream of writes out to several IO-like targets.
# Each target only needs to respond to #write and #close.
class MultiIO
  # @param targets [Array<#write, #close>] the objects every call is forwarded to
  def initialize(*targets)
    @targets = targets
  end

  # Forwards the arguments unchanged to #write on every target, in order.
  def write(*args)
    @targets.each do |target|
      target.write(*args)
    end
  end

  # Closes every target, in order.
  def close
    @targets.each do |target|
      target.close
    end
  end
end

# tasked with generating/finalizing /etc/grafana/grafana-agent.yaml with
# the Mimir address and creds and other bits
# rubocop:disable Style/NumericLiteralPrefix, Metrics/MethodLength
module GrafanaConfigSeeder

  # Path of the agent configuration; it is written into the sysconfig file and
  # is the file whose MD5 decides whether the agent needs a restart.
  CONFIG_FILE = '/etc/grafana/grafana-agent.yaml'.freeze

  # NOTE(review): presumably the properties file the `props` hashes are read from - confirm against the caller.
  WPC_SYSTEM_PROPERTIES = '/etc/system.properties'.freeze
  # NOTE(review): presumably where this script logs when not writing to stdout - confirm.
  LOG_FILE = '/var/log/grafana-agent-setup.log'.freeze
  GRAFANA_CONF_D = '/etc/grafana/conf.d'.freeze
  # Referenced as the remote_write `credentials_file` in the generated config.
  BEARER_TOKEN_FILE = '/etc/grafana/.token'.freeze
  BEARER_TOKEN_FILE_GROUP = 'grafana-agent'.freeze

  SYSCONFIG_FILE = '/etc/sysconfig/grafana-agent'.freeze
  # Write ahead logs https://grafana.com/docs/loki/latest/operations/storage/wal
  # Permission bits the WAL directory must at least have (see wal_permissions_updated).
  WAL_DIR_MODE = 0770
  # Fallback for wal_cleanup_age / max_wal_time when PROP_WAL_AGE is missing or invalid.
  WAL_AGE_DEFAULT = '1h'.freeze
  # Also the upper bound validate_scrape_config enforces on scrape_interval.
  WAL_TRUNCATE_FREQUENCY = '15m'.freeze

  # Property keys looked up in the `props` hash.
  PROP_ENABLED = 'wd.mon.grafana.agent.enabled'.freeze
  PROP_TOKEN = 'wd.mon.cortex.bearer.token'.freeze
  PROP_ENDPOINT = 'wd.mon.cortex.endpoint'.freeze
  PROP_TENANT = 'wd.mon.cortex.tenant'.freeze
  PROP_LOG_LEVEL = 'wd.mon.grafana.agent.loglevel'.freeze
  PROP_OVERRIDE = 'wd.mon.override.config'.freeze
  PROP_CLIENT_NAME = 'wd.mon.client.name'.freeze
  PROP_CLIENT_IP = 'wd.mon.client.ip'.freeze
  PROP_TLS_CA_FILE = 'wd.mon.client.tls_ca_file'.freeze
  # Despite the constant's name, the property is "skip_verify": the string
  # 'true' turns TLS verification off (it feeds insecure_skip_verify).
  PROP_TLS_VERIFY = 'wd.mon.client.skip_verify'.freeze
  PROP_WAL_AGE = 'wd.mon.client.wal_age'.freeze
  # Global scrape defaults, also assumed for scrape configs that omit them.
  DEFAULT_SCRAPE_INTERVAL = '60s'.freeze
  DEFAULT_SCRAPE_TIMEOUT = '10s'.freeze
  # Shorter intervals found in conf.d files are raised to this value.
  MINIMUM_SCRAPE_INTERVAL = '60s'.freeze
  PROP_NODE_EXPORTER_ENABLED = 'wd.mon.grafana.agent.node.exporter.enabled'.freeze
  # env_logical is read from the first property and falls back to the second.
  PROP_PERF_ENV_LOGICAL = 'wd.mon.env.logical'.freeze
  PROP_NONPERF_ENV_LOGICAL = 'wd.dcs.environment'.freeze
  # Labels every scrape config must carry on its first static config.
  REQUIRED_LABELS = %w[wd_owner wd_service].freeze

  # Module-wide logger: writes to stdout at INFO level.
  @logger = Logger.new($stdout).tap { |log| log.level = Logger::INFO }

  class << self
    # @return [Logger] the logger shared by every method of this module
    attr_reader :logger
  end

  # Raised by in_secs when a duration string cannot be parsed.
  class InvalidTimeSpec < StandardError; end
  # Raised by the validate_* helpers when a scrape or integration config is rejected.
  class InvalidScrapeError < StandardError; end

  # Renders the content of the agent's sysconfig file.
  #
  # @param props [Hash] properties; 'wd.mon.client.http_address' and
  #   'wd.mon.client.http_port' are optional (defaults 127.0.0.1 and 9090)
  # @return [String] newline-terminated KEY=VALUE lines
  def self.generate_sysconfig(props)
    host = props.fetch('wd.mon.client.http_address', '127.0.0.1')
    port = props.fetch('wd.mon.client.http_port', '9090')

    lines = [
      '## updated by grafana_setup.rb ##',
      "CONFIG_FILE=#{CONFIG_FILE}",
      "CUSTOM_ARGS=\"-server.http.address #{host}:#{port} -disable-reporting\"",
      'RESTART_ON_UPGRADE=true'
    ]
    lines.map { |line| "#{line}\n" }.join
  end

  # Builds the complete Grafana Agent configuration as a nested Hash with the
  # top-level sections 'server', 'metrics' and 'integrations'.
  #
  # @param props [Hash] string-valued properties. 'gourmet.environment' and
  #   'gourmet.data.center' must be present: both are split without a nil
  #   check, so a missing value raises NoMethodError.
  # @param scrapes [Array] scrape configs placed under the 'default' metrics config
  # @param wal_dir [String] value used for 'wal_directory'
  # @param node_exporter_enabled [Boolean] toggles the node_exporter integration
  # @param blackbox_modules [Hash] blackbox modules; the blackbox integration is
  #   enabled only when this is non-empty
  # @return [Hash] the configuration, after the optional PROP_OVERRIDE JSON has
  #   been merged into it by config_override
  def self.generate_grafana_config(props, scrapes, wal_dir, node_exporter_enabled = true, blackbox_modules = {})
    rfc_prefix = 'wd_'
    # Metric names (after the prefix) that may keep the wd_ prefix.
    name_exceptions = %w[env_status tenant customer_type]
    # Label names (after the prefix) that may keep the wd_ prefix.
    label_exceptions = %w[
      dc_physical
      dc_physical_sub
      dc_provider
      env_logical
      env_physical
      logical
      logical_type
      owner
      platform
      service
      service_instance
      source
      tenant
    ]

    # 'gourmet.environment' is either '<prefix>-<env>' or just '<env>' (the chef
    # recipe does not inject a prefix): use the part after the first dash when
    # there is one, otherwise the whole value.
    prefix, env_physical = props['gourmet.environment'].split('-')
    wd_env_physical = env_physical.nil? ? prefix : env_physical

    grafana_cfg = {
      'server' => {
        # 'grpc_listen_address' => '127.0.0.1', # set later if in cluster env (chef)
        # 'grpc_listen_port' => 9095, # same as above
        'log_level' => props.fetch(PROP_LOG_LEVEL, 'error')
      },
      'metrics' => {
        'wal_directory' => wal_dir,
        'wal_cleanup_age' => validate_duration(props[PROP_WAL_AGE], WAL_AGE_DEFAULT),
        'http_disable_keepalives' => true,
        'global' => {
          'scrape_interval' => DEFAULT_SCRAPE_INTERVAL,
          'scrape_timeout' => DEFAULT_SCRAPE_TIMEOUT,
          'external_labels' => {
            'ip_address' => props['wd.mon.client.ip'],
            'platform_role' => props['wd.mon.client.role'],
            'source' => props['wd.mon.client.name'],
            'wd_dc_provider' => 'wday',
            'wd_dc_physical' => props['gourmet.data.center'].split('-')[0],
            'wd_env_physical' => wd_env_physical,
            'wd_env_logical' => get_env_logical(props),
            'wd_logical' => get_wd_logical(props),
            'wd_logical_type' => get_wd_logical_type(props),
            'wd_platform' => props.fetch('wd.mon.client.platform', 'wpc')
            # 'wd_service' => nil, # set later, only if in VM context
          }
        },
        'configs' => [
          {
            'name' => 'default',
            'wal_truncate_frequency' => WAL_TRUNCATE_FREQUENCY,
            'max_wal_time' => validate_duration(props[PROP_WAL_AGE], WAL_AGE_DEFAULT),
            'scrape_configs' => scrapes,
            'remote_write' => [
              'url' => props[PROP_ENDPOINT],
              'headers' => {
                'X-Scope-OrgID' => props[PROP_TENANT]
              },
              'tls_config' => {
                # 'ca_file'=> nil, # set later when run by chef
                # Verification is skipped unless the property is set to something other than 'true'.
                'insecure_skip_verify' => props.fetch(PROP_TLS_VERIFY, 'true') == 'true' # properties file do not have types, everything is a string
              },
              'authorization' => {
                'type' => 'Bearer',
                'credentials_file' => BEARER_TOKEN_FILE
              },
              # In addition to the series cache, memory usage is number of shards * (capacity + max_samples_per_send)
              # capacity + max_samples_per_send = 3k will constrain shard memory usage to less than 0.5MB. So the following
              # parameters should cause a memory usage of 13.33MB per shard, so 330MB for all shards max
              # source: https://prometheus.io/docs/practices/remote_write/
              'queue_config' => {
                # It is recommended to set capacity to 3-10 times max_samples_per_send
                'capacity' => 60000, # default 2500
                'max_shards' => 25, # default 200
                'min_shards' => 1, # default 1
                # the default of 500 causes a waste of resources in Mimir distributor, batching more samples helps a lot
                'max_samples_per_send' => 20000, # default 500
                'batch_send_deadline' => '20s', # default 5s
                'min_backoff' => '500ms', # default 30ms
                'max_backoff' => '10s', # default 5s
                'retry_on_http_429' => true # default false
              },
              # The rules below run in order; together they reserve the wd_
              # prefix for the metric and label names listed as exceptions above.
              'write_relabel_configs' => [
                { # Ensure all metrics have the following source labels
                  'source_labels' => %w[wd_owner wd_service],
                  'regex' => '(.+);(.+)',
                  'action' => 'keep'
                },
                { # Populate __tmp_keep_me with true if metric name starts with prefix and is in list of exceptions
                  'source_labels' => ['__name__'],
                  'regex' => generate_regex(name_exceptions),
                  'target_label' => '__tmp_keep_me',
                  'replacement' => true
                },
                { # Drop metrics whose name starts with the prefix while __tmp_keep_me is empty (i.e. not an exception)
                  'source_labels' => %w[__name__ __tmp_keep_me],
                  'regex' => "#{rfc_prefix}.*;",
                  'action' => 'drop'
                },
                { # Duplicate labels with name starting with prefix & is in list of exceptions
                  'regex' => generate_regex(label_exceptions),
                  'replacement' => "__tmp_#{rfc_prefix}${1}",
                  'action' => 'labelmap'
                },
                { # Drop the original label from above as replacement in labelmap doesn't actually replace just duplicates
                  'regex' => generate_regex(label_exceptions),
                  'action' => 'labeldrop'
                },
                { # Populate __tmp_delete_me if label starts with prefix. Can't use drop directly as 'source_labels' is a required field for 'drop'
                  'regex' => "#{rfc_prefix}.*",
                  'action' => 'labelmap',
                  'replacement' => '__tmp_delete_me'
                },
                { # Drop all metrics with __tmp_delete_me populated
                  'source_labels' => ['__tmp_delete_me'],
                  'regex' => '(.+)',
                  'action' => 'drop'
                },
                { # Map duplicated labels back to original label key
                  'action' => 'labelmap',
                  'regex' => "__tmp_#{rfc_prefix}(.*)",
                  'replacement' => "#{rfc_prefix}${1}"
                },
                { # Drop the __tmp labels
                  'regex' => '__tmp_.*',
                  'action' => 'labeldrop'
                },
              ] # NOTE(review): an older remark here said Prometheus drops __tmp labels automatically; the last rule drops them explicitly - confirm which is intended
            ]
          }
        ]
      },
      'integrations' => {
        'agent' => {
          'enabled' => true
        },
        'node_exporter' => {
          'enabled' => node_exporter_enabled,
          'include_exporter_metrics' => true,
          # if you add at least one module to set_collectors, it will disable all default collectors but the ones specified
          'set_collectors' => ['cpu'],
          # then we enable just the collector we want
          'enable_collectors' => %w[
            bonding conntrack cpu diskstats edac entropy filefd filesystem
            hwmon loadavg meminfo netdev netstat nfs nfsd os sockstat
            stat systemd time uname vmstat
          ]
        },
        'blackbox' => {
          'enabled' => !blackbox_modules.empty?,
          'scrape_integration' => false,
          'blackbox_config' => {
            'modules' => blackbox_modules
          }
        }
      }
    }

    # was only in RPM
    unless props['gourmet.component'].nil?
      grafana_cfg['metrics']['global']['external_labels']['wd_service'] = props['gourmet.component']
    end

    # were only in chef
    unless props[PROP_TLS_CA_FILE].nil?
      grafana_cfg['metrics']['configs'][0]['remote_write'][0]['tls_config']['ca_file'] = props[PROP_TLS_CA_FILE]
    end
    unless props['wd.mon.client.grpc_address'].nil?
      grafana_cfg['server']['grpc_listen_address'] = props['wd.mon.client.grpc_address']
    end
    unless props['wd.mon.client.grpc_port'].nil?
      grafana_cfg['server']['grpc_listen_port'] = props['wd.mon.client.grpc_port'].to_i
    end

    # Merge the optional JSON override last so it wins over everything above.
    config_override(props, grafana_cfg)

    grafana_cfg
  end

  # Returns +duration+ when it is a single integer followed by one unit
  # (ns, us, µs, ms, s, m or h); otherwise returns +default_duration+.
  #
  # @param duration [String, nil] candidate value, usually read from a property
  # @param default_duration [String] fallback for a missing or malformed value
  # @return [String] the accepted duration or the fallback
  def self.validate_duration(duration, default_duration)
    return default_duration if duration.nil?
    # \A and \z anchor the whole value; ^ and $ would only anchor a single
    # line and let a multi-line value through. to_s keeps non-String input
    # from raising and lets it fall back to the default instead.
    return default_duration unless duration.to_s =~ /\A[0-9]+(ns|us|µs|ms|[smh])\z/

    duration
  end

  # Builds an alternation pattern of the form "wd_(exception1|exception2|exception3)".
  #
  # @param exceptions [Array<String>] names joined with '|' inside the group
  # @param prefix [String] literal text placed in front of the group
  # @return [String] the pattern source (a String, not a Regexp)
  def self.generate_regex(exceptions, prefix = 'wd_')
    alternatives = exceptions.join('|')
    format('%s(%s)', prefix, alternatives)
  end

  # Resolves env_logical (e.g. wd3-perf) as a lowercase string, or nil.
  # Perf instances carry wd.mon.env.logical, which takes precedence.
  # See https://jira2.workday.com/browse/DPOE-24142
  # Every other instance falls back to wd.dcs.environment.
  # Missing and empty property values are both treated as absent.
  def self.get_env_logical(props)
    lowercase_or_nil(props[PROP_PERF_ENV_LOGICAL]) ||
      lowercase_or_nil(props[PROP_NONPERF_ENV_LOGICAL])
  end

  # Resolves wd_logical as a lowercase string, or nil, in this order:
  # (1) 'wd.mon.client.logical' when it is set and non-empty
  # (2) otherwise, based on env_logical (see get_env_logical):
  #     - the empty string when env_logical is exactly 'perf'
  #       (see https://jira2.workday.com/browse/DPOE-24142)
  #     - the part of env_logical before the first dash
  # (3) nil when env_logical is unknown or the lookup fails (a warning is logged)
  def self.get_wd_logical(props)
    explicit = lowercase_or_nil(props['wd.mon.client.logical'])
    return explicit unless explicit.nil?

    begin
      env_logical = get_env_logical(props)
      return '' if env_logical == 'perf'
      return nil if env_logical.nil?

      env_logical.split('-')[0].downcase
    rescue => e
      @logger.warn "Error getting wd_logical: #{e}"
      nil
    end
  end

  # Derives wd_logical_type from env_logical (see get_env_logical):
  # - 'perf' when env_logical is exactly 'perf'
  # - nil when env_logical is unknown
  # - otherwise everything after the first dash, which is the empty string
  #   when env_logical consists of the prefix only
  # A failure while splitting is logged as a warning and yields nil.
  def self.get_wd_logical_type(props)
    env_logical = get_env_logical(props)
    return env_logical if env_logical == 'perf'
    return nil if env_logical.nil?

    begin
      _prefix, *type_parts = env_logical.split('-')
      type_parts.join('-')
    rescue => e
      @logger.warn "Error getting wd_logical_type: #{e}"
      nil
    end
  end

  # Lowercases +value+, mapping both nil and the empty string to nil.
  #
  # @param value [String, nil]
  # @return [String, nil]
  def self.lowercase_or_nil(value)
    return nil if value.nil? || value == ''

    value.downcase
  end

  # Merges the JSON document stored under PROP_OVERRIDE into +grafana_cfg+
  # (in place, via deep_merge). Does nothing when the property is absent.
  # Any error, including invalid JSON, is logged as a warning and swallowed
  # so a bad override never prevents the configuration from being generated.
  def self.config_override(props, grafana_cfg)
    begin
      if props.key?(PROP_OVERRIDE)
        raw_override = props[PROP_OVERRIDE]
        deep_merge(grafana_cfg, JSON.parse(raw_override))
      end
    rescue StandardError => e
      @logger.warn "Error applying config override: #{e}"
    end
  end

  # Converts a time spec into seconds.
  #
  # Accepted forms (case-insensitive, surrounding whitespace ignored): a plain
  # integer number of seconds, or an integer followed by s, m or h.
  #
  # @param timespec [String, Integer] e.g. '90', '60s', '15m', '1h' or 45
  # @return [Integer] the duration in seconds
  # @raise [InvalidTimeSpec] when the value is empty or not in one of the accepted forms
  def self.in_secs(timespec)
    # \A and \z anchor the whole value. With ^ and $ a multi-line value such
    # as "10\n20" matched on its first line and was silently accepted.
    match = /\A(\d+)([smh])?\z/.match(timespec.to_s.downcase.strip)
    raise InvalidTimeSpec, "timespec \"#{timespec}\" is invalid" if match.nil?

    seconds_per_unit = { 's' => 1, 'm' => 60, 'h' => 60 * 60 }
    # A missing unit means the number already is in seconds.
    match[1].to_i * seconds_per_unit.fetch(match[2], 1)
  end

  # Light-weight validation of an integration configuration: it only has to
  # contain an 'integrations' key.
  #
  # @param cfg [Hash] parsed integration configuration
  # @return [nil] when the configuration is acceptable
  # @raise [InvalidScrapeError] when the key is missing; the caller is
  #   expected to catch and log it
  def self.validate_integration_config(cfg)
    return if cfg.key?('integrations')

    raise InvalidScrapeError, "Invalid integration configuration: #{cfg.to_json}"
  end

  # Validates one scrape configuration. The checks loosely follow
  # https://prometheus.io/docs/prometheus/2.27/configuration/configuration/#scrape_config
  #
  # Checked, in this order: the document is a mapping; job_name, metrics_path
  # and static_configs exist; static_configs is a non-empty array whose first
  # entry is a mapping with a non-empty 'targets' array and a 'labels' mapping
  # holding every REQUIRED_LABELS key; scrape_timeout is shorter than
  # scrape_interval; scrape_interval does not exceed WAL_TRUNCATE_FREQUENCY.
  # Only the first static config is inspected.
  #
  # @param cfg [Hash] parsed scrape configuration
  # @return [Hash] +cfg+ itself when every check passes
  # @raise [InvalidScrapeError] on the first failed check
  # @raise [InvalidTimeSpec] when an interval or timeout cannot be parsed (from in_secs)
  def self.validate_scrape_config(cfg)
    # An empty or scalar YAML document parses to nil/false/String; report it
    # as an invalid scrape config instead of failing on cfg.key? below.
    raise InvalidScrapeError, "Scrape config should be a mapping: #{cfg.to_json}" unless cfg.is_a?(Hash)

    # validate we have job_name and metrics_path
    raise InvalidScrapeError, "Missing job name: #{cfg.to_json}" unless cfg.key?('job_name')
    raise InvalidScrapeError, "Missing metrics path: #{cfg.to_json}" unless cfg.key?('metrics_path')

    # validate we have a static configuration and a target
    raise InvalidScrapeError, "Missing static configs: #{cfg.to_json}" unless cfg.key?('static_configs')

    static_configs = cfg['static_configs']
    unless static_configs.is_a?(Array) && !static_configs.empty?
      raise InvalidScrapeError, "Invalid static configs, should be non empty array: #{cfg.to_json}"
    end

    first_static = static_configs[0]
    unless first_static.is_a?(Hash)
      raise InvalidScrapeError, "Invalid static config, should be a mapping: #{cfg.to_json}"
    end
    raise InvalidScrapeError, "Missing target: #{cfg.to_json}" unless first_static.key?('targets')

    targets = first_static['targets']
    unless targets.is_a?(Array) && !targets.empty?
      raise InvalidScrapeError, "Invalid targets, should be non empty array: #{cfg.to_json}"
    end

    # we need to verify we have the standard labels
    labels = first_static['labels']
    raise InvalidScrapeError, "Missing labels: #{cfg.to_json}" unless first_static.key?('labels') && labels.is_a?(Hash)

    REQUIRED_LABELS.each do |lbl|
      raise InvalidScrapeError, "Missing mandatory label '#{lbl}': #{cfg.to_json}" unless labels.key?(lbl)
    end

    timeout = cfg.fetch('scrape_timeout', DEFAULT_SCRAPE_TIMEOUT)
    interval = cfg.fetch('scrape_interval', DEFAULT_SCRAPE_INTERVAL)

    if in_secs(timeout) >= in_secs(interval)
      msg =  "scrape timeout (#{timeout}) >= interval (#{interval}): #{cfg.to_json}\n"
      msg += "If timeout is not specified in config the default timeout of #{DEFAULT_SCRAPE_TIMEOUT} is used\n"
      msg += "If interval is not specified in config the default interval of #{DEFAULT_SCRAPE_INTERVAL} is used\n"
      raise InvalidScrapeError, msg.strip
    end

    if in_secs(interval) > in_secs(WAL_TRUNCATE_FREQUENCY)
      raise InvalidScrapeError, "scrape_interval must be <= #{WAL_TRUNCATE_FREQUENCY}"
    end

    cfg
  end

  # Asks `grafana-agentctl config-check` whether a candidate configuration is
  # acceptable. The candidate is serialized to a temporary YAML file that is
  # removed again before returning.
  #
  # @param grafana_cfg [Hash] candidate configuration, serialized with to_yaml
  # @param scrape_name [String] name used in the log messages only
  # @return [Boolean] true when the check exits successfully; false when it
  #   fails, cannot be started, or anything else goes wrong
  def self.validated_by_agent(grafana_cfg, scrape_name)
    file = Tempfile.new('grafana_candidate_config.yaml')
    begin
      file.write(grafana_cfg.to_yaml)
      file.close
      # Argument-vector form: no shell is involved, so the path is passed
      # verbatim, and the validator's stdout and stderr are both captured.
      output, status = Open3.capture2e('grafana-agentctl', 'config-check', file.path)
      if status.success?
        @logger.info("#{scrape_name} validated with agent")
        true
      else
        @logger.warn("#{scrape_name} FAILED validation with agent")
        @logger.warn(output.strip) unless output.strip.empty?
        false
      end
    rescue StandardError => err
      # StandardError only: Interrupt and SystemExit must not be swallowed here.
      @logger.warn("Calling agent validation for #{scrape_name} raised an exception")
      @logger.warn("Error - #{err}")
      false
    ensure
      file.close!
    end
  end

  # Makes sure the WAL directory carries at least the WAL_DIR_MODE permission
  # bits. A directory that already has all of them (possibly more) is left
  # alone; otherwise its mode is set to exactly WAL_DIR_MODE.
  #
  # @param wal_dir [String] path of the WAL directory
  # @return [Boolean] true when the mode had to be changed, false otherwise
  def self.wal_permissions_updated(wal_dir)
    current_mode = File.stat(wal_dir).mode
    return false if (current_mode & WAL_DIR_MODE) == WAL_DIR_MODE

    File.chmod(WAL_DIR_MODE, wal_dir)
    true
  end

  # Recursively merges hash +b+ into hash +a+, modifying +a+ in place.
  # Hash values are merged key by key, array values index by index (see
  # deep_merge_array); for any other combination the value from +b+ replaces
  # the one in +a+. Keys that exist only in +b+ are added.
  #
  # @param a [Hash] destination, mutated
  # @param b [Hash] source of the overriding values
  def self.deep_merge(a, b)
    b.keys.each do |key|
      incoming = b[key]
      existing = a.key?(key) ? a[key] : nil

      if existing.is_a?(Hash) && incoming.is_a?(Hash)
        deep_merge(existing, incoming)
      elsif existing.is_a?(Array) && incoming.is_a?(Array)
        deep_merge_array(existing, incoming)
      else
        a[key] = incoming
      end
    end
  end

  # Merges array +b+ into array +a+ position by position, modifying +a+ in
  # place. Only the indexes present in both arrays are visited: elements of
  # +b+ beyond the length of +a+ are ignored, and elements of +a+ beyond the
  # length of +b+ are kept. Hash pairs and array pairs are merged
  # recursively; anything else is replaced by the element from +b+.
  #
  # @param a [Array] destination, mutated
  # @param b [Array] source of the overriding elements
  def self.deep_merge_array(a, b)
    shared_length = [a.size, b.size].min
    (0..shared_length - 1).each do |index|
      existing = a[index]
      incoming = b[index]

      if existing.is_a?(Hash) && incoming.is_a?(Hash)
        deep_merge(existing, incoming)
      elsif existing.is_a?(Array) && incoming.is_a?(Array)
        deep_merge_array(existing, incoming)
      else
        a[index] = incoming
      end
    end
  end

  # Scrape config for the agent's own /metrics endpoint (this logic used to
  # live in grafana-agent.yaml). Only the metrics returned by
  # get_grafana_agent_metrics_to_keep survive the 'keep' relabel rule.
  #
  # @param props [Hash] properties; 'wd.mon.client.http_address' and
  #   'wd.mon.client.http_port' are optional (defaults 127.0.0.1 and 9090)
  # @return [Hash] the scrape config, still carrying the 'add_source' marker
  def self.grafana_agent_scrape_config(props)
    keep_pattern = get_grafana_agent_metrics_to_keep.join('|').chomp
    host = props.fetch('wd.mon.client.http_address', '127.0.0.1')
    port = props.fetch('wd.mon.client.http_port', '9090')

    static_config = {
      'targets' => ["#{host}:#{port}"],
      'labels' => {
        'wd_owner' => 'mia',
        'wd_service' => 'pharos-grafana-agent'
      }
    }
    keep_rule = {
      'source_labels' => ['__name__'],
      'regex' => keep_pattern,
      'action' => 'keep'
    }

    {
      'job_name' => 'grafana_agent',
      'metrics_path' => '/metrics',
      'scrape_interval' => '180s',
      'scrape_timeout' => '30s',
      'add_source' => 'true',
      'static_configs' => [static_config],
      'metric_relabel_configs' => [keep_rule]
    }
  end

  # Names of the agent's own metrics that are forwarded; everything else from
  # the agent's /metrics endpoint is dropped by the 'keep' relabel rule built
  # in grafana_agent_scrape_config.
  #
  # The list previously contained go_memstats_heap_inuse_bytes twice; the
  # duplicate was removed, which leaves the resulting regex equivalent.
  #
  # @return [Array<String>] a fresh array on every call
  def self.get_grafana_agent_metrics_to_keep
    %w[
      agent_build_info
      agent_tcp_connections
      agent_wal_samples_appended_total
      agent_wal_storage_active_series
      go_gc_duration_seconds_count
      go_goroutines
      go_memstats_heap_inuse_bytes
      process_cpu_seconds_total
      prometheus_remote_storage_enqueue_retries_total
      prometheus_remote_storage_highest_timestamp_in_seconds
      prometheus_remote_storage_queue_highest_sent_timestamp_seconds
      prometheus_remote_storage_samples_dropped_total
      prometheus_remote_storage_samples_failed_total
      prometheus_remote_storage_samples_pending
      prometheus_remote_storage_samples_retried_total
      prometheus_remote_storage_samples_total
      prometheus_remote_storage_sent_batch_duration_seconds_bucket
      prometheus_remote_storage_sent_batch_duration_seconds_count
      prometheus_remote_storage_sent_batch_duration_seconds_sum
      prometheus_remote_storage_shard_capacity
      prometheus_remote_storage_shards
      prometheus_remote_storage_shards_desired
      prometheus_remote_storage_shards_max
      prometheus_remote_storage_shards_min
      prometheus_remote_storage_succeeded_samples_total
      prometheus_sd_discovered_targets
      prometheus_target_interval_length_seconds_sum
      prometheus_target_scrapes_exceeded_sample_limit_total
      prometheus_target_scrapes_sample_duplicate_timestamp_total
      prometheus_target_scrapes_sample_out_of_bounds_total
      prometheus_target_scrapes_sample_out_of_order_total
      prometheus_target_sync_length_seconds_sum
      prometheus_wal_watcher_current_segment
    ]
  end

  # Names of the metrics removed from the node exporter output by the 'drop'
  # relabel rule built in node_exporter_script_exporter_config.
  #
  # @return [Array<String>] a fresh array on every call
  def self.get_node_exporter_metrics_to_drop
    %w[
      agent_request_duration_seconds_bucket
      agent_request_message_bytes_bucket
      agent_response_message_bytes_bucket
      node_cpu_package_throttles_total
      node_disk_ata_rotation_rate_rpm
      node_disk_ata_write_cache
      node_disk_device_mapper_info
      node_entropy_pool_size_bits
      node_exporter_build_info
      node_hwmon_chip_names
      node_hwmon_power_average_interval_max_seconds
      node_hwmon_power_average_interval_min_seconds
      node_hwmon_power_average_interval_seconds
      node_hwmon_power_average_watt
      node_hwmon_power_is_battery_watt
      node_hwmon_sensor_label
      node_hwmon_temp_highest_celsius
      node_nfs_connections_total
      node_nfs_packets_total
      node_nfs_requests_total
      node_nfs_rpc_authentication_refreshes_total
      node_nfs_rpc_retransmissions_total
      node_nfs_rpcs_total
      node_time_clocksource_available_info
      node_time_clocksource_current_info
      node_time_zone_offset_seconds
      prometheus_sd_kubernetes_events_total
      prometheus_target_sync_length_seconds
      promhttp_metric_handler_errors_total
      promhttp_metric_handler_requests_in_flight
    ]
  end

  # Returns a copy of +hash+ without the entries whose value is nil
  # (Hash#compact is not available on ruby 2.3.1). Other "empty" values such
  # as false or '' are kept. The input hash is not modified.
  #
  # @param hash [Hash]
  # @return [Hash] a new hash
  def self.compact(hash)
    hash.each_with_object({}) do |(key, value), kept|
      kept[key] = value unless value.nil?
    end
  end

  # The Grafana Agent (GA) requires a restart to pick up config changes.
  # The MD5 of the GA config is stored next to it ("#{CONFIG_FILE}.md5") to
  # make that comparison cheap.
  # (!) This method does not check whether GA is enabled or disabled before
  # running, as the agent can be (re-)started here.
  #
  # Flow: start the agent, then compare the config's MD5 with the stored one
  # and check that the service was started after the config was last
  # modified. When both hold, restart only if +restart_required+; otherwise
  # store the new MD5 and restart.
  #
  # @param restart_required [Boolean] true when something outside the GA
  #   config changed and needs a restart even though the config is unchanged
  # @param service_change_enabled [Boolean] false to skip every systemd
  #   start/restart (e.g. for debugging); the MD5 file is still written
  def self.grafana_agent_refresh(restart_required, service_change_enabled)
    @logger.info 'Starting Grafana Agent refresh'
    @logger.info "Grafana Agent service status changes are #{service_change_enabled ? 'enabled' : 'disabled'}"

    # There is the possibility that GA is not running, but should be running.
    # We don't bother to check if it is not running before triggering the start.
    # The action of checking if the agent is running takes the same time as the no-op
    # of trying to start a running service.
    @logger.info 'Proactively starting Grafana Agent (no-op if already running)'
    grafana_agent_service('start') if service_change_enabled

    config_md5_location = "#{CONFIG_FILE}.md5"

    # calculate_md5 returns '' when the config file does not exist.
    current_md5 = calculate_md5(CONFIG_FILE)

    # check if the service was started after any config changes and the calculated MD5 matches the file
    if md5_matches?(CONFIG_FILE, config_md5_location, current_md5) && grafana_agent_restarted_since_cfg_update(CONFIG_FILE)
      @logger.info 'Grafana Agent config md5 matches and config is up to date'

      if restart_required
        @logger.info 'Changes outside of Grafana Agent config require a Grafana Agent restart'
        grafana_agent_service('restart') if service_change_enabled
      end

      # Nothing else to do: the stored MD5 is already current.
      return
    end

    # We check if the service is older than the config md5.
    # Writing the md5 first keeps the service younger than the md5
    # in case we have to restart the service too.
    write_file(config_md5_location, current_md5, 'md5')
    grafana_agent_service('restart') if service_change_enabled
    @logger.info 'Grafana Agent refresh is completed'
  end

  # Tells whether the Grafana Agent service was started after +cfg+ was last
  # modified, i.e. whether the running agent can have read the current file.
  #
  # @param cfg [String] path of the file whose modification time is compared
  # @return [Boolean] true only when the service start time (from
  #   service_start_epoch) is strictly later than the file's modification
  #   time; false when the file is missing, the start time is unknown (-1),
  #   or the service has not been started (0)
  def self.grafana_agent_restarted_since_cfg_update(cfg)
    begin
      # File.mtime already is a Time; no string round trip is needed to get
      # whole epoch seconds.
      cfg_mtime_epoch = File.mtime(cfg).to_i
    rescue Errno::ENOENT => e
      @logger.warn "Could not read file modification date of #{cfg}, because of: #{e}"
      return false # to keep the lights on
    end

    ga_service_start_epoch = service_start_epoch

    if ga_service_start_epoch == -1
      # well this did not turn up as we expected...
      # refresh EVERYTHING
      # https://giphy.com/gifs/sUNqplVFtsctW
      @logger.error 'Could not determine the Grafana Agent service start time; treating the config as not picked up yet.'
      return false
    end

    ga_service_start_epoch > cfg_mtime_epoch
  end

  # Computes the MD5 hex digest of a file.
  #
  # @param config_location [String] path of the file to digest
  # @return [String] the hex digest, or '' (with a warning logged) when the
  #   file does not exist
  def self.calculate_md5(config_location)
    begin
      digest = Digest::MD5.file(config_location)
      digest.hexdigest
    rescue Errno::ENOENT => e
      @logger.warn "MD5 of #{config_location} could not be created, because of: #{e}"
      ''
    end
  end

  # Compares the stored MD5 with the freshly calculated one.
  #
  # @param config_location [String] config path, used in the log message only
  # @param config_md5_location [String] path of the file holding the stored MD5
  # @param current_md5 [String] digest to compare against
  # @return [Boolean] true when the stored content equals +current_md5+
  #   exactly; false when it differs or the MD5 file does not exist
  def self.md5_matches?(config_location, config_md5_location, current_md5)
    begin
      stored_md5 = File.read(config_md5_location)
    rescue Errno::ENOENT => e
      @logger.warn "MD5 of #{config_location} and #{config_md5_location} did not match, because of: #{e}"
      return false
    end

    stored_md5 == current_md5
  end

  # Queries systemd for the time the grafana-agent service last became active.
  #
  # @return [Integer] the start time as epoch seconds; 0 when systemd reports
  #   no start time (service not running or never started); -1 when the time
  #   could not be obtained or parsed
  def self.service_start_epoch
    @logger.info 'Query Grafana Agent service systemd start datetime.'

    show_cmd = %w[systemctl show --property=ActiveEnterTimestamp grafana-agent]
    begin
      # Argument-vector form, no shell: a missing systemctl raises here
      # instead of being hidden behind a shell pipeline.
      output, _status = Open3.capture2(*show_cmd)
    rescue SystemCallError => e
      @logger.error "Couldn't get Grafana Agent systemd start datetime via #{show_cmd.join(' ')} because of #{e}."
      return -1
    end

    # The answer has the form KEY=VALUE; keep what follows the first '='.
    # systemd reports an empty value when the service is not running or has
    # not run yet.
    # TODO: automated tests get no ActiveEnterTimestamp while manual testing does - investigate.
    start_text = output.to_s.split('=', 2)[1].to_s.strip
    if start_text.empty?
      @logger.warn 'Systemd shows that Grafana Agent was not started.'
      return 0 # if we return -1 we signal failure
    end

    begin
      DateTime.parse(start_text).to_time.to_i
    rescue ArgumentError => e
      @logger.error "Couldn't convert Grafana Agent systemd start datetime #{start_text} because of #{e}."
      @logger.error grafana_agent_active_properties
      -1
    end
  end

  # Diagnostic helper: the lines of `systemctl show grafana-agent` that
  # mention "Active", or '' when systemctl cannot be run.
  def self.grafana_agent_active_properties
    output, _status = Open3.capture2('systemctl', 'show', 'grafana-agent')
    output.lines.grep(/Active/).join
  rescue SystemCallError
    ''
  end

  # Starts, stops or restarts the grafana-agent systemd service.
  #
  # @param state_change [String] one of 'start', 'stop' or 'restart'; any
  #   other value is logged as an error and nothing is executed
  # @return [String, nil, Object] the command's combined output when it ran;
  #   nil for an unknown +state_change+. Callers in this file ignore it.
  def self.grafana_agent_service(state_change)
    @logger.info "Trying to #{state_change} the Grafana Agent."

    unless %w[start stop restart].include?(state_change)
      @logger.error "We can start, stop and restart the Grafana Agent, but we don't know #{state_change}."
      return
    end

    state_change_cmd = "systemctl #{state_change} grafana-agent"
    @logger.info "Trying to run #{state_change_cmd}."

    begin
      output, status = Open3.capture2e('systemctl', state_change, 'grafana-agent')
      # A failing systemctl used to go unnoticed; surface it in the log.
      unless status.success?
        @logger.error "#{state_change_cmd} exited with status #{status.exitstatus}: #{output.strip}"
      end
      output
    rescue Errno::ENOENT, Errno::EACCES => e
      @logger.error "Could not change the Grafana Agent service via #{state_change_cmd}, because of #{e}."
    end
  end

  # Script-exporter entry for the node exporter (this logic used to live in
  # node_exporter.yml).
  #
  # Labels are assembled from three sources, later ones winning on conflict:
  # the WPC labels (only when 'wd.mon.node.exporter.wpc.labels.enabled' is the
  # string 'true'; unset properties are left out), the JSON object stored in
  # 'wd.mon.node.exporter.extra.labels', and finally the fixed wd_owner /
  # wd_service pair.
  #
  # @param props [Hash] string-valued properties
  # @return [Hash] the script exporter configuration
  # @raise [JSON::ParserError] when the extra labels property is not valid JSON
  def self.node_exporter_script_exporter_config(props)
    wpc_labels = {}
    if props.fetch('wd.mon.node.exporter.wpc.labels.enabled', 'false') == 'true'
      @logger.info 'Enabling WPC labels on node exporter metrics'
      label_sources = {
        'base_image_version' => 'gourmet.base.image.version',
        'base_os' => 'gourmet.base.os',
        'dc' => 'gourmet.data.center',
        'flavor' => 'gourmet.flavor',
        'gourmet_environment' => 'gourmet.environment',
        'pds_class' => 'gourmet.class',
        'wpc_version' => 'gourmet.wpc.version'
      }
      candidates = label_sources.each_with_object({}) do |(label, property), found|
        found[label] = props.fetch(property, nil)
      end
      wpc_labels = compact(candidates)
    end

    extra_labels = {}
    extra_labels_val = props.fetch('wd.mon.node.exporter.extra.labels', nil)
    unless extra_labels_val.nil?
      @logger.info 'Adding extra custom labels on node exporter metrics'
      extra_labels = JSON.parse(extra_labels_val)
    end

    # highest priority to mia_labels
    mia_labels = {
      'wd_owner' => 'mia',
      'wd_service' => 'pharos-grafana-agent'
    }
    labels = wpc_labels.merge(extra_labels).merge(mia_labels)

    drop_pattern = get_node_exporter_metrics_to_drop.join('|').chomp
    host = props.fetch('wd.mon.client.http_address', '127.0.0.1')
    port = props.fetch('wd.mon.client.http_port', '9090')
    drop_rule = {
      'source_labels' => ['__name__'],
      'regex' => drop_pattern,
      'action' => 'drop'
    }

    {
      'name' => 'node_exporter',
      'command' => "/etc/plugins/pharos/prometheus_metrics_aggregator_for_node_exporter.rb -E \"node_systemd_\" -l 10000 --host #{host} --port #{port}",
      'scrape_interval' => '180s',
      'scrape_timeout' => '60s',
      'add_source' => true,
      'labels' => labels,
      'metric_relabel_configs' => [drop_rule]
    }
  end

  # Loads the blackbox exporter modules defined under
  # "<conf_d_folder>/integrations/blackbox" (*.yaml first, then *.yml, each
  # group in sorted order so the result does not depend on directory order).
  #
  # Every file must contain a mapping of module names to definitions. A file
  # is skipped, with a warning, when it cannot be parsed, is not a mapping,
  # or defines a module name an earlier file already provided.
  #
  # @param conf_d_folder [String] base configuration directory
  # @return [Hash] module name => module definition; {} when the blackbox
  #   directory does not exist
  def self.load_blackbox_modules(conf_d_folder)
    blackbox_folder = "#{conf_d_folder}/integrations/blackbox"

    unless Dir.exist? blackbox_folder
      @logger.warn "Missing blackbox integration directory: #{blackbox_folder}"
      return {}
    end

    modules = {}
    module_files = Dir["#{blackbox_folder}/*.yaml"].sort + Dir["#{blackbox_folder}/*.yml"].sort
    module_files.each do |f|
      @logger.info "Loading #{f} as blackbox module configuration"
      begin
        cfg = YAML.safe_load File.read f
        # An empty file or a list parses to something without #keys; say so explicitly.
        raise InvalidScrapeError, "Blackbox module file #{f} should contain a mapping of module names" unless cfg.is_a?(Hash)

        conflicting_keys = cfg.keys & modules.keys
        raise InvalidScrapeError, "Conflicting blackbox module defined in #{f}: #{conflicting_keys}" unless conflicting_keys.empty?

        modules.merge!(cfg)
      rescue => error
        @logger.warn "Error loading blackbox module file #{f}: #{error}\n#{error.backtrace.join("\n")}"
      end
    end
    modules
  end

  # Collects the scrape configs found under +scrape_folder+ (any *.yaml or
  # *.yml file at any depth, except those below "<scrape_folder>/integrations/")
  # and appends the agent's own scrape config at the end.
  #
  # Each file is validated, gets the source / ip_address labels when it asks
  # for them with a truthy 'add_source', has its scrape interval raised to
  # MINIMUM_SCRAPE_INTERVAL when it is lower, receives its per-job property
  # override, and is validated again. A file failing any step is skipped
  # with a warning.
  #
  # @param props [Hash] string-valued properties
  # @param scrape_folder [String] directory holding the scrape config files
  # @return [Array<Hash>] scrape configs; [] when the directory does not exist
  def self.load_scrape_configs(props, scrape_folder)
    unless Dir.exist?(scrape_folder)
      @logger.warn "Missing configuration directory: #{scrape_folder}"
      return []
    end

    std_labels = {
      'add_source' => { 'source' => props[PROP_CLIENT_NAME], 'ip_address' => props[PROP_CLIENT_IP] }
    }

    integrations_prefix = "#{scrape_folder}/integrations/"
    candidates = Dir["#{scrape_folder}/**/*.yaml", "#{scrape_folder}/**/*.yml"]
    scrape_files = candidates.reject { |file_path| file_path.include?(integrations_prefix) }

    configs = []
    scrape_files.each do |f|
      @logger.info "Loading #{f}"
      begin
        # NOTE(review): YAML.load is not restricted to plain data on older
        # Psych versions; consider YAML.safe_load if these files can come
        # from a less trusted source.
        cfg = validate_scrape_config(YAML.load(File.read(f)))
        add_labels_grafana_scrape_config(cfg, std_labels)

        interval = cfg.fetch('scrape_interval', DEFAULT_SCRAPE_INTERVAL)
        if in_secs(interval) < in_secs(MINIMUM_SCRAPE_INTERVAL)
          cfg['scrape_interval'] = MINIMUM_SCRAPE_INTERVAL
          @logger.info "Overwriting scrape interval defined in #{f}, as the value was too low. Scrape interval must be at least #{MINIMUM_SCRAPE_INTERVAL}"
        end

        scrape_override(props, cfg)
        # Validate again: the override may have broken the config.
        configs.push(validate_scrape_config(cfg))
      rescue => error
        @logger.warn "Error loading file #{f}: #{error}\n#{error.backtrace.join("\n")}"
      end
    end

    own_scrape_cfg = add_labels_grafana_scrape_config(grafana_agent_scrape_config(props), std_labels)
    scrape_override(props, own_scrape_cfg)
    configs << own_scrape_cfg
  end

  # Applies the "marker" keys of +std_labels+ to a scrape config, in place.
  # For every marker key: when the config holds a truthy value under it, the
  # associated labels are merged into the labels of the first static config
  # (created if missing). The marker key itself is always removed.
  #
  # @param grafana_scrape_cfg [Hash] scrape config, mutated
  # @param std_labels [Hash{String => Hash}] marker key => labels to add
  # @return [Hash] the same +grafana_scrape_cfg+ object
  def self.add_labels_grafana_scrape_config(grafana_scrape_cfg, std_labels)
    std_labels.each do |marker, extra_labels|
      if grafana_scrape_cfg[marker]
        first_static = grafana_scrape_cfg['static_configs'][0]
        first_static['labels'] = {} unless first_static.key?('labels')
        first_static['labels'].merge!(extra_labels)
      end
      grafana_scrape_cfg.delete(marker)
    end
    grafana_scrape_cfg
  end

  # Merges a per-scrape override from the system properties into +cfg+.
  #
  # The property looked up is "wd.mon.override.scrape.<name>", where <name> is
  # the config's 'job_name' (or 'name' when 'job_name' is nil) with every
  # underscore replaced by a dot. Its value is parsed as JSON and deep-merged
  # into +cfg+. A value that cannot be parsed or merged is logged and ignored.
  #
  # @param props [Hash] system properties
  # @param cfg [Hash] scrape configuration to override
  def self.scrape_override(props, cfg)
    scrape_name = cfg['job_name'].nil? ? cfg['name'] : cfg['job_name']
    # property names are dot separated, so underscores become dots
    override_name = "wd.mon.override.scrape.#{scrape_name.gsub('_', '.')}"
    return unless props.key?(override_name)

    @logger.info "Applying override #{override_name}"
    begin
      deep_merge(cfg, JSON.parse(props[override_name]))
    rescue => e
      # ignore override if it's incorrect
      @logger.warn "Error applying override #{override_name}: #{e}"
    end
  end

  # Checks whether the configuration on disk differs from the generated one,
  # i.e. after scrape configs have been considered and overrides applied.
  #
  # A file that cannot be read or parsed counts as changed so that it gets
  # overwritten. Only StandardError is rescued: process-level exceptions such
  # as SystemExit or Interrupt must not be turned into "changed".
  #
  # @param file_path [String] path of the current configuration file
  # @param cfg [Object] freshly generated configuration to compare against
  # @return [Boolean] true when the file differs from +cfg+ or is unreadable
  def self.config_changed(file_path, cfg)
    changed = YAML.load(File.read(file_path)) != cfg
    @logger.info "File #{file_path} changed: #{changed}"
    changed
  rescue StandardError => e
    @logger.warn "Error reading #{file_path}: #{e}. File will be overwritten."
    true
  end

  # Writes +content+ to +path+ (truncating it) and sets the file mode to 0644.
  #
  # Failures are logged, not raised. The error message uses +path+: the file
  # handle of the File.open block is not in scope inside the rescue clause.
  #
  # @param path [String] destination file
  # @param content [String] data to write
  # @param content_type [String] short label used only in the log messages
  def self.write_file(path, content, content_type)
    @logger.info "Trying to write #{content_type} to #{path}."
    begin
      File.open(path, 'w') do |file|
        file.write(content)
        file.chmod(0644)
      end
      @logger.info "Successfully wrote #{content_type} to #{path}."
    rescue StandardError => e
      @logger.error "Could not write #{content_type} to #{path}: #{e}."
    end
  end

  # Checks whether SYSCONFIG_FILE differs from the generated content.
  #
  # An unreadable file counts as changed so that it gets (re)written. The
  # warning names the file path, not the generated content.
  #
  # @param sysconfig [String] freshly generated sysconfig content
  # @return [Boolean] true when the file content differs or cannot be read
  def self.sysconfig_changed(sysconfig)
    File.read(SYSCONFIG_FILE) != sysconfig
  rescue StandardError => e
    @logger.warn "Could not read system config file #{SYSCONFIG_FILE}: #{e}. File will be overwritten"
    true
  end

  # Checks whether every directory at or below +directory+ belongs to +owner+.
  #
  # Runs `find <directory> ! -user <owner> -type d -ls`; each output line is a
  # directory with a different owner. The command is started from an argument
  # vector, so no shell interprets the path.
  #
  # @param directory [String] WAL directory to inspect
  # @param owner [String] expected owning user, defaults to 'grafana-agent'
  # @return [Boolean] true when find reports no directory with another owner;
  #   false when it reports at least one, or when the check could not be run
  #   (find missing or exiting with an error)
  def self.correct_wal_dir_owner?(directory, owner = 'grafana-agent')
    @logger.info("Checking ownership for wal directory: #{directory}")

    begin
      incorrect_owner_dirs, find_errors, status =
        Open3.capture3('find', directory, '!', '-user', owner, '-type', 'd', '-ls')
    rescue SystemCallError => e # e.g. Errno::ENOENT when find is not available
      @logger.error("Cannot check ownership for wal directory #{directory} (#{e}). Will set ownership to make sure.")
      return false
    end

    # Empty output from a failed find proves nothing about ownership.
    unless status.success?
      @logger.error("Cannot check ownership for wal directory #{directory} (#{find_errors.strip}). Will set ownership to make sure.")
      return false
    end

    @logger.info("Ownership check for wal directory #{directory} showed #{incorrect_owner_dirs}.")
    incorrect_owner_dirs == ''
  end

  # Picks the WAL location.
  #
  # @param directory [String] preferred WAL directory
  # @return [String] +directory+ when it exists as a directory, otherwise the
  #   fallback '/var/lib/grafana-agent/'
  def self.wal_directory(directory)
    File.directory?(directory) ? directory : '/var/lib/grafana-agent/'
  end

  # Entry point of the setup script.
  #
  # Steps, in order: parse the command line, load the system properties,
  # store the bearer token, prepare the WAL directory under /data, write or
  # remove the node exporter definition, build and validate the agent
  # configuration scrape by scrape, write the configuration and sysconfig
  # files when they changed, and finally hand over to grafana_agent_refresh.
  #
  # Exit codes set here: 0 on normal completion (also when the agent is
  # disabled, or when -c finds it enabled), 1 when -c finds it disabled,
  # 2 when mandatory properties are missing or a StandardError is unhandled.
  def self.main

    @logger = Logger.new MultiIO.new($stdout, File.open(LOG_FILE, 'a'))
    @logger.level = Logger::INFO

    options = {}
    OptionParser.new do |opts|
      opts.banner = 'Usage: grafana_setup.rb [options]'

      opts.on('-c', '--check-enabled', 'Check if grafana agent is enabled on this host') do |e|
        options[:check_enabled] = e
      end

      options[:enable_service_changes] = true
      opts.on('-e', '--enable-service-changes [BOOLEAN]', TrueClass,
              'Allow changing the Grafana Agent service status, defaults to true.') do |e|
        options[:enable_service_changes] = e.nil? ? true : e
      end

      options[:output_config] = CONFIG_FILE
      opts.on('-o', '--output FILE', "output resulting grafana config file, defaults to #{CONFIG_FILE}") do |file|
        options[:output_config] = file
      end

      options[:properties_file] = WPC_SYSTEM_PROPERTIES
      opts.on('-p', '--properties-file FILE', "Custom system properties file, defaults to #{WPC_SYSTEM_PROPERTIES}") do |file|
        options[:properties_file] = file
      end

      options[:scrape_folder] = GRAFANA_CONF_D
      opts.on('-s', '--scrape-folder FOLDER',
              "directory containing extra scrapes yaml config, defaults to #{GRAFANA_CONF_D}.") do |folder|
        options[:scrape_folder] = folder
      end

      opts.on('-h', '--help', 'Display this screen') do
        puts opts
        exit 0
      end

    end.parse!

    # This script gained the feature to restart the GA on its own.
    # Since this might be undesirable in some situation, i.e. debugging,
    # bug in the setup script,... We have a simplistic guard to stop any
    # service manipulation by the script by creating a file in /etc/grafana:
    # /etc/grafana/stop_auto_service_changes
    if options[:enable_service_changes]
      options[:enable_service_changes] = !File.exist?('/etc/grafana/stop_auto_service_changes')
      @logger.info("Grafana Agent service changes are enabled: #{options[:enable_service_changes]}.")
    end

    @logger.info("Loading overrides #{options[:properties_file]}")
    props = SystemProps.load_config_properties(options[:properties_file])
    exit 2 unless SystemProps.all_mandatory_keys_are_present(props)

    File.write(BEARER_TOKEN_FILE, props[PROP_TOKEN])
    FileUtils.chown(nil, BEARER_TOKEN_FILE_GROUP, BEARER_TOKEN_FILE)

    # mkdir directory /data/grafana-agent if /data directory exists on hosts
    if File.directory?('/data/')
      # assumed to run only once during the first setup
      unless File.directory?('/data/grafana-agent')
        FileUtils.mkpath('/data/grafana-agent')
        FileUtils.chown('grafana-agent', 'grafana-agent', '/data/grafana-agent')
        FileUtils.chmod(0770, '/data/grafana-agent')
      end

      # ensure the owner and mode of the WAL directory
      if File.directory?('/data/grafana-agent') && !correct_wal_dir_owner?('/data/grafana-agent')
        @logger.info('Re-applying owner and mode for /data/grafana-agent')
        FileUtils.chown_R('grafana-agent', 'grafana-agent', '/data/grafana-agent')
        FileUtils.chmod_R(0770, '/data/grafana-agent')
      end
    else
      @logger.error('Cannot access /data to create the WAL directory')
    end

    wal_dir = self.wal_directory('/data/grafana-agent/').freeze # TODO: check when we can remove this

    # overrides for testing in c7_base hosts
    if props['gourmet.component'] == 'c7_base'
      props[PROP_ENABLED] = 'true'
      props[PROP_ENDPOINT] = 'https://nowhere.io/'
    end

    enabled = props.fetch(PROP_ENABLED, 'false') == 'true'
    if options[:check_enabled]
      # only check if grafana agent is enabled and exit
      exit enabled ? 0 : 1
    end
    unless enabled
      @logger.info 'Grafana agent is not enabled, attempting to stop the agent service.'

      grafana_agent_service('stop') if options[:enable_service_changes]

      exit
    end

    # write node exporter yaml to conf.d if it doesn't exist / update when changes are detected
    node_exporter_enabled = props.fetch(PROP_NODE_EXPORTER_ENABLED, 'true') == 'true'
    node_exporter_file = '/etc/pharos_script_exporter/conf.d/node_exporter.yaml'

    begin
      if node_exporter_enabled
        node_exporter = node_exporter_script_exporter_config(props)
        scrape_override(props, node_exporter)

        if !File.exist?(node_exporter_file)
          @logger.info "Creating #{node_exporter_file}"
          File.open(node_exporter_file, 'w') { |file| file.write(node_exporter.to_yaml) }
        elsif YAML.load_file(node_exporter_file) != node_exporter then
          @logger.info "Updating contents of #{node_exporter_file}"
          File.open(node_exporter_file, 'w') { |file| file.write(node_exporter.to_yaml) }
        else
          @logger.info "No changes in #{node_exporter_file}"
        end
      else
        File.delete(node_exporter_file) if File.exist?(node_exporter_file)
      end
    rescue IOError => e
      # The block form of File.open has already closed the handle; there is no
      # file variable in this scope to close, so only report the failure.
      @logger.warn "Could not update #{node_exporter_file}: #{e}"
    end

    # builds conf one scrape at a time, and validate against agent service
    scrape_configs = load_scrape_configs(props, options[:scrape_folder])

    validated_scrapes = []
    scrape_configs.each do |scrape_config|
      grafana_cfg = generate_grafana_config(props, [scrape_config], wal_dir, node_exporter_enabled, {})
      validated_scrapes.push scrape_config if validated_by_agent(grafana_cfg, scrape_config['job_name'])
    end
    # sorting once after the loop gives the same order as re-sorting on every pass
    validated_scrapes.sort_by! { |scrape| scrape['job_name'] }

    ga_restart_required = false # for changes that are not reflected in the config and require a GA restart

    blackbox_modules = load_blackbox_modules(options[:scrape_folder])
    grafana_cfg_with_blackbox_modules = generate_grafana_config(props, validated_scrapes, wal_dir, node_exporter_enabled, blackbox_modules)

    if validated_by_agent(grafana_cfg_with_blackbox_modules, 'all scrapes and blackbox modules')
      grafana_cfg = grafana_cfg_with_blackbox_modules
    else
      @logger.warn 'Grafana agent configuration with blackbox integration not valid, trying without'
      grafana_cfg = generate_grafana_config(props, validated_scrapes, wal_dir, node_exporter_enabled, {})
    end

    if config_changed(options[:output_config], grafana_cfg) && validated_by_agent(grafana_cfg, 'All scrapes together')
      write_file(options[:output_config], grafana_cfg.to_yaml, 'config')
      ga_restart_required = true
      @logger.info "Grafana config #{options[:output_config]} changed, exit 0 should restart agent"
    end

    sysconfig = generate_sysconfig(props)
    if sysconfig_changed(sysconfig)
      write_file(SYSCONFIG_FILE, sysconfig, 'config')
      ga_restart_required = true
      @logger.info 'Grafana sysconfig changed, exit 0 should restart agent'
    end

    if wal_permissions_updated(wal_dir)
      ga_restart_required = true
      @logger.info 'Grafana WAL permissions changed, exit 0 should restart agent'
    end

    # there might have been changes to the config requiring an agent restart and config md5 update
    grafana_agent_refresh(ga_restart_required, options[:enable_service_changes])

    exit
  rescue => e
    # exit raises SystemExit, which is not a StandardError and so passes through
    @logger.error "Unhandled exception: #{e}"
    @logger.error "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
    exit 2
  end

  # Retrieves the properties we need from PDS/Gourmet: reads the system
  # properties file and adds the host name and IP address of this node.
  class SystemProps
    # keys that must be present before a configuration can be generated
    MANDATORY_PROP_KEYS = [
      PROP_TOKEN,
      PROP_ENDPOINT,
      PROP_TENANT
    ].freeze

    @logger = Logger.new(STDOUT)
    @logger.level = Logger::INFO

    # Loads "key=value" lines from +props_file+ and adds PROP_CLIENT_NAME and
    # PROP_CLIENT_IP for the local node. Neither an unreadable file nor a
    # failure while determining name/IP raises; both are logged.
    #
    # @param props_file [String] path of the .properties file
    # @return [Hash{String => String}] the properties found (possibly partial)
    def self.load_config_properties(props_file)
      props = {}
      # Comment lines in .properties files are denoted by
      # the number sign (#) or the exclamation mark (!) as the first non blank character
      begin
        File.readlines(props_file)
            .reject { |line| line.lstrip.start_with?('#', '!') }
            .select { |line| line.include?('=') }
            .each do |line|
          key, val = line.split('=', 2)
          props[key.strip] = val.strip
        end
      rescue StandardError => e
        @logger.error "Unable to read system properties file #{props_file}: #{e}."
      end

      begin
        # add a couple more relative to the node itself
        props[PROP_CLIENT_NAME] = get_hostname
        interface = get_interface(parse_route_table)
        # single lookup; fall back to the first non-loopback IPv4 address
        props[PROP_CLIENT_IP] = get_ip_of_interface(interface) || get_first_ip_address
      rescue => e
        @logger.error "Unable to determine the host name or IP address: #{e}."
      end
      props
    end

    # @return [String, nil] first non-loopback IPv4 address of the host, nil
    #   when there is none, 'N/A' when the lookup raises
    def self.get_first_ip_address
      Socket.ip_address_list.select { |i| !i.ipv4_loopback? && i.ipv4? }.map(&:ip_address).first
    rescue => error
      @logger.warn "Could not get the first IP of host. Error: #{error}"
      return 'N/A'
    end

    # Socket.getifaddrs does not exist in ruby v2.0.0; there the NoMethodError
    # is rescued below and nil is returned.
    #
    # @param interface [String] interface name, e.g. 'bond0'
    # @return [String, nil] first IP address bound to +interface+, nil when
    #   there is none or the lookup fails
    def self.get_ip_of_interface(interface)
      Socket.getifaddrs
            # entries without an address (addr is nil) are skipped
            .select { |ifaddr| ifaddr.name == interface && ifaddr.addr && ifaddr.addr.ip? }
            .map { |ifaddr| ifaddr.addr.ip_address }
            .first
    rescue => error
      @logger.warn "Could not get the public IP of host. Error: #{error}"
      nil
    end

    # Parses the output of `/usr/bin/netstat -nr`.
    #
    # The row whose first column is 'Destination' provides the column names.
    # Everything before it and blank lines are ignored; every later row
    # becomes a hash keyed by those column names.
    #
    # @return [Hash{String => Hash}] destination => row; empty when the command
    #   cannot be run or fails
    def self.parse_route_table
      result = {}
      command = "/usr/bin/netstat -nr"
      begin
        output, status = Open3.capture2(command)
      rescue => error
        @logger.warn "Error: #{error}, when running #{command}. Unable to parse route table"
        return result
      end

      unless status.success? && output
        @logger.warn "Could not parse the Route Table."
        return result
      end

      headers = nil
      begin
        output.each_line do |raw_line|
          fields = raw_line.split
          next if fields.empty?

          if fields[0] == 'Destination'
            headers = fields
            next
          end
          next if headers.nil? # preamble before the header row

          result[fields[0]] = Hash[headers.zip(fields)]
        end
      rescue => error
        @logger.warn "Could not parse the Route Table. Error: #{error}"
      end

      result
    end

    # All CFNs use fabric0: if a fabric interface exists it is the public
    # interface, otherwise we want the interface of the default route.
    #
    # @param route_table [Hash{String => Hash}] result of parse_route_table
    # @return [String] interface name; 'bond0' if all else fails
    def self.get_interface(route_table)
      fabric_entry = route_table.values.find { |entry| entry['Iface'] =~ /\Afabric\d\z/ }
      return fabric_entry['Iface'] if fabric_entry

      route_table.fetch('0.0.0.0', {}).fetch('Iface', 'bond0')
    end

    # @return [String] output of `hostname -f` without trailing whitespace
    def self.get_hostname
      `hostname -f`.rstrip
    end

    # @return [String] +key+ without surrounding whitespace
    def self.as_prop(key)
      key.strip
    end

    # @param props [Hash] loaded properties
    # @return [Boolean] true when every MANDATORY_PROP_KEYS entry is present
    def self.all_mandatory_keys_are_present(props)
      missing = MANDATORY_PROP_KEYS.reject { |k| props.key?(as_prop(k)) }
      return true if missing.empty?

      # NOTE(review): relies on GrafanaConfigSeeder.logger being defined
      # elsewhere in this file - confirm.
      GrafanaConfigSeeder.logger.error "Missing required entries in the configuration: #{missing.join(' ')}"
      false
    end

  end
end

# Run the seeder only when this file is executed directly, not when it is
# loaded with require/load.
GrafanaConfigSeeder.main if __FILE__ == $PROGRAM_NAME

# rubocop:enable Style/NumericLiteralPrefix, Metrics/MethodLength
