#!/usr/bin/perl
# -*- perl -*-

# Copyright (C) 2015-2025 Pirx Developers - https://pirx.dev/
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

=head1 NAME

hq_smart - Munin plugin to monitor disk S.M.A.R.T. attributes.

=head1 APPLICABLE SYSTEMS

Linux systems with smartctl command available.

=head1 CONFIGURATION

This plugin must be run as root so smartctl can access disk devices.
Just add the following to your munin-node plugin configuration:

  [hq_smart]
    user root

=head1 VERSION

  20250325

=head1 MAGIC MARKERS

  #%# family=manual
  #%# capabilities=autoconf

=head1 BUGS

None.

=head1 AUTHOR

Pirx Developers - https://pirx.dev/

=head1 LICENSE

GPLv3

=cut

use strict;
use warnings;
use Munin::Plugin;

# Forward declaration; read_smart() is defined at the end of this file.
sub read_smart;

# Handle autoconf.
# Munin expects "yes" or "no (reason)" on stdout and exit status 0 in both
# cases. The plugin cannot do anything without smartctl, so only answer "yes"
# when an executable smartctl exists in one of the PATH directories.
if(defined($ARGV[0]) and $ARGV[0] eq 'autoconf') {
  my @path_dirs = defined($ENV{'PATH'}) ? split(/:/, $ENV{'PATH'}) : ();
  my $smartctl_found = 0;
  foreach my $dir (@path_dirs) {
    # An empty PATH element means "current directory"; never rely on that.
    next if($dir eq "");
    my $candidate = $dir . "/smartctl";
    if(-f $candidate and -x $candidate) {
      $smartctl_found = 1;
      last;
    }
  }
  if($smartctl_found) {
    print("yes\n");
  }
  else {
    print("no (smartctl not found in PATH)\n");
  }
  exit(0);
}

#need_multigraph();

# ---------------------------------------------------------------------------
# Disk discovery
#
# Every usable entry of /sys/block is probed with read_smart(). Devices that
# smartctl reports as sitting behind a RAID controller are remembered and
# their physical disks are probed in a second pass. Each disk that reports a
# serial number ends up in %disks, keyed by that serial number.
# ---------------------------------------------------------------------------
my %disks;         # serial number => snapshot of %smart_data for that disk
my %smart_data;    # scratch hash, reset and refilled by every read_smart() call
my %raid_devices;  # "/dev/<name>" => RAID type reported for that device

# Probes one (device, type, sub-device) combination and records the disk in
# %disks when a serial number was found. Terminates the plugin when smartctl
# itself could not be run (status 127 or -1). The message goes to STDERR so
# that it cannot be mistaken for plugin output by Munin.
my $probe_disk = sub {
  my ($device, $type, $disk) = @_;
  my $status = read_smart($device, $type, $disk);
  if($status == 127 or $status == -1) {
    print(STDERR "Error running smartctl. Exiting.\n");
    exit(1);
  }
  if(defined($smart_data{'serial'})) {
    # Copy the hash: %smart_data is cleared again by the next read_smart().
    $disks{$smart_data{'serial'}} = { %smart_data };
  }
  return($status);
};

my @block_devs = glob("/sys/block/*");
@block_devs = map(m!/sys/block/([^/]+)!, @block_devs);

BLOCK_DEVS_LOOP: foreach my $block_dev (@block_devs) {
  # Skip floppies, optical drives, RAM disks, loop devices, md and dm volumes.
  if($block_dev =~ m/^(fd|scr|sr|ram|loop|md|dm\-)[0-9]/) {
    next(BLOCK_DEVS_LOOP);
  }
  # Legacy IDE names (hdX) are also used by CD-ROM and floppy drives; the
  # sysfs "media" attribute tells them apart. A missing file means "disk".
  if($block_dev =~ m/^hd[a-z]/) {
    my $device_type = "";
    if(open(my $media_fd, '<', "/sys/block/$block_dev/device/media")) {
      my $first_line = <$media_fd>;
      close($media_fd);
      if(defined($first_line)) {
        $device_type = $first_line;
        chomp($device_type);
      }
    }
    if($device_type eq "cdrom" or $device_type eq "floppy") {
      next(BLOCK_DEVS_LOOP);
    }
  }
  $probe_disk->("/dev/" . $block_dev, "", "");
  if(defined($smart_data{'raid'})) {
    $raid_devices{"/dev/" . $block_dev} = $smart_data{'raid'};
  }
}

# Second pass: physical disks behind RAID controllers.
my $has_3ware = 0;
foreach my $device (sort(keys(%raid_devices))) {
  my $raid_type = $raid_devices{$device};
  # "perc_h310" is kept for compatibility; both types are queried with
  # smartctl's "-d megaraid,N" on the logical device itself.
  if($raid_type eq "megaraid" or $raid_type eq "perc_h310") {
    foreach my $sub_device (0 .. 31) {
      $probe_disk->($device, "megaraid", $sub_device);
    }
  }
  if($raid_type eq "3ware") {
    $has_3ware = 1;
  }
}

# 3ware disks are addressed through /dev/twaN rather than through the block
# device that revealed the controller, so the scan is independent of how many
# block devices reported "3ware". Run it once instead of once per device.
if($has_3ware) {
  foreach my $twa_device (0 .. 15) {
    foreach my $sub_device (0 .. 31) {
      $probe_disk->("/dev/twa" . $twa_device, "3ware", $sub_device);
    }
  }
}

# "config" mode: describe every graph this plugin can produce, then exit.
#
# Per disk (only disks with a known model name are reported) up to five
# multigraphs are emitted: error counters, SSD wear, temperature, power-on
# hours and power cycles. A graph or field is only announced when the
# corresponding value was actually found for that disk.
if(defined($ARGV[0]) and $ARGV[0] eq 'config') {
  # Fields of the "errors" graph, in output order:
  # [ field name, label, colour, alert level, SMART attribute id ]
  # An undefined attribute id means the value comes from 'media_errors'.
  my @error_fields = (
    [ 'reallocated_sectors',   'Reallocated sectors',   '990000', 'critical', '5'   ],
    [ 'reallocation_events',   'Reallocation events',   'ff9900', 'critical', '196' ],
    [ 'pending_sectors',       'Pending sectors',       'ff00ff', 'critical', '197' ],
    [ 'uncorrectable_sectors', 'Uncorrectable sectors', 'ff0000', 'critical', '198' ],
    [ 'media_errors',          'Media errors',          'ff0000', 'critical', undef ],
    [ 'read_errors',           'Read errors',           '0000ff', 'warning',  '1'   ],
    [ 'seek_errors',           'Seek errors',           '3399ff', 'warning',  '7'   ],
    [ 'spin_retries',          'Spin retries',          '00ffff', 'warning',  '10'  ],
    [ 'uncorrectable_errors',  'Uncorrectable errors',  'ff9900', 'warning',  '187' ],
    [ 'command_timeouts',      'Command timeouts',      'ffff00', 'warning',  '188' ],
  );

  # Returns the graph-level lines shared by all graphs of one disk.
  my $graph_header = sub {
    my ($disk, $multigraph, $vlabel) = @_;
    return(join("\n",
      "multigraph " . $multigraph,
      "graph_title " . $disk->{'model'} . " (s/n: " . $disk->{'serial'} . ")",
      "graph_args --base 1000 --lower-limit 0",
      "graph_scale no",
      "graph_vlabel " . $vlabel,
      "graph_category disk_smart",
    ) . "\n");
  };

  # Returns the label/type/draw/colour lines of a single GAUGE field.
  my $field_config = sub {
    my ($name, $label, $draw, $colour) = @_;
    return(join("\n",
      $name . ".label " . $label,
      $name . ".type GAUGE",
      $name . ".draw " . $draw,
      $name . ".colour " . $colour,
    ) . "\n");
  };

  foreach my $id (keys %disks) {
    my $disk = $disks{$id};
    next if(!defined($disk->{'model'}));

    my $graph_name = $id;
    $graph_name =~ s/[^a-zA-Z0-9]/_/g;

    # Error counters: the graph exists as soon as one counter is available.
    my @present_fields;
    foreach my $field (@error_fields) {
      my $attribute = $field->[4];
      my $value = defined($attribute)
        ? $disk->{'smart'}{$attribute}{'raw'}
        : $disk->{'media_errors'};
      push(@present_fields, $field) if(defined($value));
    }
    if(@present_fields) {
      print($graph_header->($disk, "disk_errors_" . $graph_name, "errors"));
      foreach my $field (@present_fields) {
        my ($name, $label, $colour, $level) = @{$field};
        print($field_config->($name, $label, "LINE2", $colour));
        # Any non-zero counter raises the alert.
        print($name . "." . $level . " 0\n");
      }
    }

    # SSD wear is a percentage (NVMe "percentage used", or 100 minus the
    # normalized value of ATA attribute 173/177), so label the axis as such.
    if(defined($disk->{'ssd_wear'})) {
      print($graph_header->($disk, "ssd_wear_" . $graph_name, "percent used"));
      print($field_config->("ssd_wear", "SSD wear level", "AREA", "0000ff"));
    }

    if(defined($disk->{'temperature'})) {
      print($graph_header->($disk, "disk_temperature_" . $graph_name, "degrees celsius"));
      # The line colour reflects the temperature seen while generating the
      # configuration: blue below 25, green up to 50, orange up to 60, red above.
      my $graph_color = "00ff00";
      if($disk->{'temperature'} < 25) {
        $graph_color = "0000ff";
      }
      if($disk->{'temperature'} > 50) {
        $graph_color = "ff9900";
      }
      if($disk->{'temperature'} > 60) {
        $graph_color = "ff0000";
      }
      print($field_config->("temperature", "Disk temperature", "LINE2", $graph_color));
      print("temperature.line 40:ffff00:Warning threshold\n");
      print("temperature.warning 50\n");
      print("temperature.critical 60\n");
    }

    if(defined($disk->{'power_on_hours'})) {
      print($graph_header->($disk, "disk_poh_" . $graph_name, "time"));
      print($field_config->("power_on_hours", "Power On Hours", "AREA", "00d000"));
    }

    if(defined($disk->{'power_cycles'})) {
      print($graph_header->($disk, "disk_power_cycles_" . $graph_name, "cycles"));
      # The field keeps its historical name so existing RRD files stay valid.
      print($field_config->("start_stop_cycles", "Start/Stop cycles", "LINE2", "000080"));
    }
  }
  exit(0);
}

# Fetch mode: print the current values for every graph of every disk.
# Only disks with a known model name are reported, and a multigraph section is
# only printed when at least one of its values is available.

# Fields of the "errors" graph in output order: [ field name, SMART attribute id ].
# An undefined attribute id means the value is read from 'media_errors'.
my @error_sources = (
  [ 'reallocated_sectors',   '5'   ],
  [ 'reallocation_events',   '196' ],
  [ 'pending_sectors',       '197' ],
  [ 'uncorrectable_sectors', '198' ],
  [ 'media_errors',          undef ],
  [ 'read_errors',           '1'   ],
  [ 'seek_errors',           '7'   ],
  [ 'spin_retries',          '10'  ],
  [ 'uncorrectable_errors',  '187' ],
  [ 'command_timeouts',      '188' ],
);

# Single-value graphs in output order:
# [ multigraph name prefix, field name, key in the disk record ].
my @simple_graphs = (
  [ 'ssd_wear',          'ssd_wear',          'ssd_wear'       ],
  [ 'disk_temperature',  'temperature',       'temperature'    ],
  [ 'disk_poh',          'power_on_hours',    'power_on_hours' ],
  [ 'disk_power_cycles', 'start_stop_cycles', 'power_cycles'   ],
);

foreach my $serial (keys(%disks)) {
  my $record = $disks{$serial};
  next if(!defined($record->{'model'}));

  # Graph names may only contain letters, digits and underscores.
  (my $suffix = $serial) =~ s/[^a-zA-Z0-9]/_/g;

  # Collect the error counters first so the section header is printed only
  # when there is something to report.
  my @error_lines;
  foreach my $source (@error_sources) {
    my ($field, $attribute) = @{$source};
    my $reading = defined($attribute)
      ? $record->{'smart'}{$attribute}{'raw'}
      : $record->{'media_errors'};
    if(defined($reading)) {
      push(@error_lines, $field . ".value " . $reading . "\n");
    }
  }
  if(@error_lines) {
    print("multigraph disk_errors_" . $suffix . "\n", @error_lines);
  }

  foreach my $graph (@simple_graphs) {
    my ($prefix, $field, $key) = @{$graph};
    next if(!defined($record->{$key}));
    print("multigraph " . $prefix . "_" . $suffix . "\n");
    print($field . ".value " . $record->{$key} . "\n");
  }
}

exit(0);

# read_smart($device, $type, $disk)
#
# Runs "smartctl --json=g -a" for one device and stores what it finds in the
# file-level hash %smart_data, which is emptied first.
#
# Parameters:
#   $device - device node to query, e.g. "/dev/sda"
#   $type   - smartctl device type for "-d" (e.g. "megaraid"), or ""
#   $disk   - sub-device number appended to the type ("-d TYPE,N"), or ""
#             "-d" is only passed when both $type and $disk are non-empty.
#
# Keys that may be set in %smart_data:
#   raid            "megaraid" or "3ware" when smartctl asks for "-d TYPE,N"
#   model, serial, temperature, power_on_hours, power_cycles,
#   media_errors, ssd_wear
#   smart           { <attribute id> => { value => ..., raw => ... } }
#
# Returns the exit status of the smartctl pipeline (127 when the shell cannot
# find smartctl), or -1 when the pipe could not be opened at all.
#
# No prototype is declared on purpose: the sub takes three arguments and is
# forward-declared without a prototype at the top of the file.
sub read_smart {
  my ($device, $type, $disk) = @_;

  my $params = "--json=g -a";
  if($type ne "" and $disk ne "") {
    $params .= " -d " . $type . "," . $disk;
  }
  $params .= " " . $device;

  %smart_data = ();

  # The command still goes through the shell (needed for the stderr
  # redirection); the arguments originate from /sys/block and fixed strings.
  my $exec_fd;
  if(!open($exec_fd, '-|', "smartctl " . $params . " 2>/dev/null")) {
    return(-1);
  }

  # Flattened JSON keys that map directly to one %smart_data entry:
  # [ target key, pattern, only-if-unset ]
  # Entries flagged "only-if-unset" are fallbacks that never overwrite a value
  # that has already been seen.
  my @scalar_fields = (
    [ 'model',          qr/^model_name\s+(.*)$/,                                         0 ],
    [ 'model',          qr/^scsi_model_name\s+(.*)$/,                                    1 ],
    [ 'serial',         qr/^serial_number\s+(.*)$/,                                      0 ],
    [ 'temperature',    qr/^temperature\.current\s+(.*)$/,                               0 ],
    [ 'temperature',    qr/^nvme_smart_health_information_log\.temperature\s+(.*)$/,     1 ],
    [ 'power_on_hours', qr/^power_on_hours\.hours\s+(.*)$/,                              0 ],
    [ 'power_on_hours', qr/^nvme_smart_health_information_log\.power_on_hours\s+(.*)$/,  1 ],
    [ 'power_cycles',   qr/^power_cycle_count\s+(.*)$/,                                  0 ],
    [ 'power_cycles',   qr/^nvme_smart_health_information_log\.power_cycles\s+(.*)$/,    1 ],
    [ 'media_errors',   qr/^nvme_smart_health_information_log\.media_errors\s+(.*)$/,    0 ],
    [ 'media_errors',   qr/^scsi_grown_defect_list\s+(.*)$/,                             1 ],
    [ 'ssd_wear',       qr/^nvme_smart_health_information_log\.percentage_used\s+(.*)$/, 0 ],
  );

  # ATA attribute table rows, collected by table index so that the id, value
  # and raw string of one row are paired regardless of the line order.
  my %ata_rows;

  SMARTCTL_READ_LOOP: while(defined(my $line = <$exec_fd>)) {
    # Turn 'json.key = "value";' into 'key value'.
    $line =~ s/[\r\n]*$//;
    $line =~ s/^json\.//;
    $line =~ s/\s*=\s*/ /;
    $line =~ s/(\"|;)//g;

    # smartctl hints at the RAID type when a logical RAID device is queried.
    if($line =~ m/please try adding '-d (.*),N'/) {
      if($1 eq "megaraid" or $1 eq "3ware") {
        $smart_data{'raid'} = $1;
      }
    }

    foreach my $field (@scalar_fields) {
      my ($key, $pattern, $only_if_unset) = @{$field};
      next if($only_if_unset and defined($smart_data{$key}));
      if($line =~ $pattern) {
        $smart_data{$key} = $1;
      }
    }

    if($line =~ m/^ata_smart_attributes\.table\[([0-9]+)\]\.(id|value|raw\.string)\s+(.*)$/) {
      $ata_rows{$1}{$2} = $3;
    }
  }
  close($exec_fd);
  # close() on a pipe stores the child's wait status in $?; take it right away.
  my $exit_code = $? >> 8;

  # Re-key the ATA attribute rows by attribute id. Rows without an id are
  # dropped instead of being filed under an undefined key.
  foreach my $index (sort { $a <=> $b } keys(%ata_rows)) {
    my $row = $ata_rows{$index};
    next if(!defined($row->{'id'}));
    if(defined($row->{'value'})) {
      $smart_data{'smart'}{$row->{'id'}}{'value'} = $row->{'value'};
    }
    if(defined($row->{'raw.string'})) {
      $smart_data{'smart'}{$row->{'id'}}{'raw'} = $row->{'raw.string'};
    }
  }

  # Fallbacks from the ATA attribute table for values that were not reported
  # through dedicated JSON keys.
  if(!defined($smart_data{'temperature'}) and defined($smart_data{'smart'}{'194'}{'raw'})) {
    $smart_data{'temperature'} = $smart_data{'smart'}{'194'}{'raw'};
    # The raw string may carry extras such as "(Min/Max ...)"; keep the first token.
    $smart_data{'temperature'} =~ s/\s+.*$//;
  }
  if(!defined($smart_data{'power_on_hours'}) and defined($smart_data{'smart'}{'9'}{'raw'})) {
    $smart_data{'power_on_hours'} = $smart_data{'smart'}{'9'}{'raw'};
  }
  if(!defined($smart_data{'power_cycles'}) and defined($smart_data{'smart'}{'12'}{'raw'})) {
    $smart_data{'power_cycles'} = $smart_data{'smart'}{'12'}{'raw'};
  }
  # Wear level from attribute 173 or 177: 100 minus the normalized value.
  # The normalized value must be present, otherwise the result would be a
  # bogus 100.
  foreach my $wear_attribute ('173', '177') {
    if(!defined($smart_data{'ssd_wear'}) and
       defined($smart_data{'smart'}{$wear_attribute}{'raw'}) and
       defined($smart_data{'smart'}{$wear_attribute}{'value'})) {
      $smart_data{'ssd_wear'} = 100 - $smart_data{'smart'}{$wear_attribute}{'value'};
    }
  }
  # Raw strings of attributes 1 and 7 may look like "N/M"; keep the part
  # before the slash.
  foreach my $ratio_attribute ('1', '7') {
    if(defined($smart_data{'smart'}{$ratio_attribute}{'raw'})) {
      $smart_data{'smart'}{$ratio_attribute}{'raw'} =~ s!/.*!!;
    }
  }

  return($exit_code);
}
