Skip to main content
Added prettify hints; input format seems to be machine-name last, not first
Source Link
Toby Speight
  • 88.3k
  • 14
  • 104
  • 327
  • Each record is semicolon-separated.
  • Fields:
    1. Month (in MM.YYYY format)
    2. Data size (numeric value with a "GB" suffix)
    3. Machine name (string)
05.2024;1178.88 GB;IMBI-0002
04.2024;8259.91 GB;OCI-0001
04.2024;1972.24 GB;MATHI-0001
05.2024;4377.87 GB;TEST-0001
05.2024;42930.72 GB;IUP-0001
05.2024;98.79 GB;URZ-0002
05.2024;6999.23 GB;IDF-0001
04.2024;0.11 GB;INTERN
04.2024;13560.51 GB;IMSE-A-0001
05.2024;125161.29 GB;MEDMA-0002
05.2024;900878.88 GB;HITS-A-0001
05.2024;4704.86 GB;IPMB-0001
05.2024;74438.6 GB;ZMBH-A-0001
05.2024;2935.98 GB;ZUV-0004
04.2024;958.78 GB;DBIO-0001
04.2024;1244.26 GB;ZITI-0003
05.2024;17610.2 GB;UMA-0002
04.2024;654.47 GB;URZ-A-0001
04.2024;90342.36 GB;IWR-0001
05.2024;47060.74 GB;GEOG-0001
05.2024;1178.88 GB;IMBI-0002
04.2024;8259.91 GB;OCI-0001
04.2024;1972.24 GB;MATHI-0001
05.2024;4377.87 GB;TEST-0001
05.2024;42930.72 GB;IUP-0001
05.2024;98.79 GB;URZ-0002
05.2024;6999.23 GB;IDF-0001
04.2024;0.11 GB;INTERN
04.2024;13560.51 GB;IMSE-A-0001
05.2024;125161.29 GB;MEDMA-0002
05.2024;900878.88 GB;HITS-A-0001
05.2024;4704.86 GB;IPMB-0001
05.2024;74438.6 GB;ZMBH-A-0001
05.2024;2935.98 GB;ZUV-0004
04.2024;958.78 GB;DBIO-0001
04.2024;1244.26 GB;ZITI-0003
05.2024;17610.2 GB;UMA-0002
04.2024;654.47 GB;URZ-A-0001
04.2024;90342.36 GB;IWR-0001
05.2024;47060.74 GB;GEOG-0001
./monitor-data-changes.sh 04.csv 05.csv lenient verbose
./monitor-data-changes.sh 04.csv 05.csv lenient verbose
ZUV-0003                    100% [    161.98 GB ->     161.29 GB]
ZUV-0004                     99% [   2966.57 GB ->    2935.98 GB]
ZUV-0005                    101% [      3.06 GB ->       3.08 GB]
Ratio for COS-0001     is outside the 95-105% range
Ratio for DBIO-0001    is outside the 95-105% range
Ratio for DMATH-0001   is outside the 95-105% range
ZUV-0003                    100% [    161.98 GB ->     161.29 GB]
ZUV-0004                     99% [   2966.57 GB ->    2935.98 GB]
ZUV-0005                    101% [      3.06 GB ->       3.08 GB]
Ratio for COS-0001     is outside the 95-105% range
Ratio for DBIO-0001    is outside the 95-105% range
Ratio for DMATH-0001   is outside the 95-105% range
#!/bin/bash

# This is a driver for Awk script "monitor-data-changes.awk",
# which see for detailed explanations.
#
# Usage: monitor-data-changes.sh file1 file2 strict|lenient [verbose]
#
# file1 and file2 are CSV files whose lines read "month;size;machine".
# The first Awk stage skips comment and empty lines and moves the
# machine name to the front, which is the column order the Awk script
# expects.  The Awk script tags each output line with a numeric prefix;
# sort/cut/column use it to order the report and then strip it.
#
# Exit status: 2 on a usage error, otherwise that of the pipeline
# (non-zero if any stage failed).

# Thure Dührsen, 2024-06-11..12        Initial version

# Report a failing Awk stage through the exit status; without this the
# script would always return the status of the final "column".
set -o pipefail

# Print a usage-error message on stderr and stop with status 2.
die() {
    printf '%s\n' "$1" >&2
    exit 2
}

if [ "$#" -lt 3 ] || [ "$#" -gt 4 ]; then
    {
        echo "Usage: "
        echo "             $0 file1 file2 strict"
        echo "             $0 file1 file2 lenient"
        echo "             $0 file1 file2 strict  verbose"
        echo "             $0 file1 file2 lenient verbose"
        echo
        echo "where file1 and file2 are CSV files,"
        echo "and 'strict', 'lenient', 'verbose' are typed exactly as shown."
    } >&2
    exit 2
fi

[ -f "$1" ] || die "First argument is not a file"
[ -f "$2" ] || die "Second argument is not a file"

case "$3" in
    strict | lenient) ;;
    *) die "Third argument is neither 'strict' nor 'lenient'" ;;
esac

if [ "$#" -eq 4 ] && [ "$4" != 'verbose' ]; then
    die "Fourth argument, if present, must be 'verbose'"
fi

# Look for the Awk script next to this driver rather than in the
# caller's working directory, so the driver works from anywhere.
script_dir=$(dirname -- "${BASH_SOURCE[0]}")

awk -F';' -v OFS=';' '
/^#/ {next} # Skip comment lines
/^$/ {next} # Skip empty lines
NF != 3 {
    print FILENAME ":" FNR ": need exactly three fields per line" > "/dev/stderr"
    exit 1
}
{print $3, $1, $2}
' "$1" "$2"                                                      |
awk -f "$script_dir/monitor-data-changes.awk" -v strict="$3" -v verbose="${4-}" |
sort -t';' -k1,1 -k2,2 -k3,3                                     |
cut -d';' -f2-                                                   |
column -s ';' -t
#!/bin/bash

# This is a driver for Awk script "monitor-data-changes.awk",
# which see for detailed explanations.
#
# Usage: monitor-data-changes.sh file1 file2 strict|lenient [verbose]
#
# file1 and file2 are CSV files whose lines read "month;size;machine".
# The first Awk stage skips comment and empty lines and moves the
# machine name to the front, which is the column order the Awk script
# expects.  The Awk script tags each output line with a numeric prefix;
# sort/cut/column use it to order the report and then strip it.
#
# Exit status: 2 on a usage error, otherwise that of the pipeline
# (non-zero if any stage failed).

# Thure Dührsen, 2024-06-11..12        Initial version

# Report a failing Awk stage through the exit status; without this the
# script would always return the status of the final "column".
set -o pipefail

# Print a usage-error message on stderr and stop with status 2.
die() {
    printf '%s\n' "$1" >&2
    exit 2
}

if [ "$#" -lt 3 ] || [ "$#" -gt 4 ]; then
    {
        echo "Usage: "
        echo "             $0 file1 file2 strict"
        echo "             $0 file1 file2 lenient"
        echo "             $0 file1 file2 strict  verbose"
        echo "             $0 file1 file2 lenient verbose"
        echo
        echo "where file1 and file2 are CSV files,"
        echo "and 'strict', 'lenient', 'verbose' are typed exactly as shown."
    } >&2
    exit 2
fi

[ -f "$1" ] || die "First argument is not a file"
[ -f "$2" ] || die "Second argument is not a file"

case "$3" in
    strict | lenient) ;;
    *) die "Third argument is neither 'strict' nor 'lenient'" ;;
esac

if [ "$#" -eq 4 ] && [ "$4" != 'verbose' ]; then
    die "Fourth argument, if present, must be 'verbose'"
fi

# Look for the Awk script next to this driver rather than in the
# caller's working directory, so the driver works from anywhere.
script_dir=$(dirname -- "${BASH_SOURCE[0]}")

awk -F';' -v OFS=';' '
/^#/ {next} # Skip comment lines
/^$/ {next} # Skip empty lines
NF != 3 {
    print FILENAME ":" FNR ": need exactly three fields per line" > "/dev/stderr"
    exit 1
}
{print $3, $1, $2}
' "$1" "$2"                                                      |
awk -f "$script_dir/monitor-data-changes.awk" -v strict="$3" -v verbose="${4-}" |
sort -t';' -k1,1 -k2,2 -k3,3                                     |
cut -d';' -f2-                                                   |
column -s ';' -t
# Thure Dührsen, 2024-06-11..12        Initial version
# 2024-06-22                           Ensure consistent ratio calc on unsorted input

# For any number of machines, calculates the ratio of data usage
# between two months for each machine: amount of data in later month
# divided by amount of data in earlier month, expressed as a
# percentage.

# Ensures that data is from exactly two months and that each machine
# has a record for both months; machines that do not are reported on
# stderr and left out of the calculation.

# Optionally checks whether there is more than one record per month
# and machine and if so, stops processing the entire file.  Otherwise
# the first record of each (machine, month) pair is used and later
# ones are reported and ignored.

# Prints the ratio and flags machines with ratios outside
# the 95-105% range.

# Every output line starts with "1;", "2;" or "3;" so that the driver
# can sort the report into sections and strip the prefix afterwards.

# Requires GNU Awk (PROCINFO["sorted_in"], regex interval expressions).

# Command-line options (to be used in ./monitor-data-changes.sh):

# -v strict="strict"    Enable strict mode: Reject more than one record
#                       for each (machine, month) pair and stop processing
#                       the entire file if this happens
# -v strict="lenient"   Disable strict mode
#
# -v verbose="verbose"  Enable verbose mode: print ratios and sizes for
#                       all machines

# Preconditions:

# Input is CSV, concatenated from two months, with three fields in each line.
# Semicolon as field delimiter,
# first column is a machine name,
# second column is month and year (MM.YYYY),
# third column is a file size given in GB (floating-point
# number plus the string " GB").

# Exit with an error
function errprint(message) {
    print "ERROR: " message > "/dev/stderr"
    error_occurred = 1 # global error flag
    exit 1 # skip to END block
}

# Warn about bad data, but continue processing
function noteprint(message) {
    print "NOTE: " message > "/dev/stderr"
}

# Turn "MM.YYYY" into "YYYYMM" so that comparing two keys orders the
# months chronologically, also across a year boundary (comparing the
# raw strings would put 01.2025 before 12.2024).
function month_sort_key(month) {
    return substr(month, 4) substr(month, 1, 2)
}

BEGIN {
    FS = ";"
    # Traverse arrays ordered by indices in ascending order compared
    # as strings
    # https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html
    PROCINFO["sorted_in"] = "@ind_str_asc"
    error_occurred = 0  # Initialize error flag, global
    # Any other non-empty value of "strict" counts as true
    if (strict == "lenient") {strict = 0}
}

{
    # Validate every record, duplicates included, before using it
    if ($2 !~ /^(0[1-9]|1[0-2])\.[0-9]{4}$/) {
        # TODO: Better check for year range
        errprint("Not a valid MM.YYYY: " $2)
    }
    if ($3 !~ /^[0-9]+(\.[0-9]+)?[[:space:]]*GB$/) {
        errprint("Not a valid size: " $3)
    }

    key = $1 FS $2
    if (key in data) {
        msg = "Duplicate data for machine " $1 " in month " $2
        if (strict) {
            errprint(msg)
        } else {
            noteprint(msg)
        }
    } else {
        # Adding 0 keeps only the leading number and drops the " GB"
        data[key] = $3 + 0
    }

    months[$2]++
    machines[$1]++
}

END {
    # Skip the END block if an error occurred
    if (error_occurred) {
        exit 1
    }

    if (length(months) != 2) {
        msg="Data across the entire set is not from exactly two months"
        errprint(msg)
    }

    # Work out which of the two months is the earlier one
    earlier = ""
    later = ""
    for (month in months) {
        if (earlier == "") {
            earlier = month
        } else {
            later = month
        }
    }
    if (month_sort_key(earlier) > month_sort_key(later)) {
        tmp = earlier
        earlier = later
        later = tmp
    }

    allok = 1
    for (machine in machines) {
        # Machines lacking a record for either month are skipped
        if (!((machine FS earlier) in data) || !((machine FS later) in data)) {
            msg = "Machine " machine " does not have exactly one record per month"
            noteprint(msg)
            continue
        }

        before = data[machine FS earlier]
        after = data[machine FS later]

        # A ratio against zero is undefined; report it instead of
        # letting Awk abort with a division-by-zero error
        if (before == 0) {
            allok = 0
            if (verbose) {
                printf "1;%s;%10s [%10.2f GB -> %10.2f GB]\n",
                    machine, "n/a", before, after
            }
            print "2;Ratio for " machine "; is undefined (no data in " earlier ")"
            continue
        }

        ratio = after / before * 100

        if (verbose) {
            printf "1;%s;%9.0f%% [%10.2f GB -> %10.2f GB]\n",
                machine, ratio, before, after
        }

        if (ratio < 95 || ratio > 105) {
            allok = 0
            print "2;Ratio for " machine "; is outside the 95-105% range"
        }
    }

    # Only print the summary when it applies; no stray empty line otherwise
    if (allok) {
        print "3;Ratio for all machines in 95-105% range"
    }
}
# Thure Dührsen, 2024-06-11..12        Initial version
# 2024-06-22                           Ensure consistent ratio calc on unsorted input

# For any number of machines, calculates the ratio of data usage
# between two months for each machine: amount of data in later month
# divided by amount of data in earlier month, expressed as a
# percentage.

# Ensures that data is from exactly two months and that each machine
# has a record for both months; machines that do not are reported on
# stderr and left out of the calculation.

# Optionally checks whether there is more than one record per month
# and machine and if so, stops processing the entire file.  Otherwise
# the first record of each (machine, month) pair is used and later
# ones are reported and ignored.

# Prints the ratio and flags machines with ratios outside
# the 95-105% range.

# Every output line starts with "1;", "2;" or "3;" so that the driver
# can sort the report into sections and strip the prefix afterwards.

# Requires GNU Awk (PROCINFO["sorted_in"], regex interval expressions).

# Command-line options (to be used in ./monitor-data-changes.sh):

# -v strict="strict"    Enable strict mode: Reject more than one record
#                       for each (machine, month) pair and stop processing
#                       the entire file if this happens
# -v strict="lenient"   Disable strict mode
#
# -v verbose="verbose"  Enable verbose mode: print ratios and sizes for
#                       all machines

# Preconditions:

# Input is CSV, concatenated from two months, with three fields in each line.
# Semicolon as field delimiter,
# first column is a machine name,
# second column is month and year (MM.YYYY),
# third column is a file size given in GB (floating-point
# number plus the string " GB").

# Exit with an error
function errprint(message) {
    print "ERROR: " message > "/dev/stderr"
    error_occurred = 1 # global error flag
    exit 1 # skip to END block
}

# Warn about bad data, but continue processing
function noteprint(message) {
    print "NOTE: " message > "/dev/stderr"
}

# Turn "MM.YYYY" into "YYYYMM" so that comparing two keys orders the
# months chronologically, also across a year boundary (comparing the
# raw strings would put 01.2025 before 12.2024).
function month_sort_key(month) {
    return substr(month, 4) substr(month, 1, 2)
}

BEGIN {
    FS = ";"
    # Traverse arrays ordered by indices in ascending order compared
    # as strings
    # https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html
    PROCINFO["sorted_in"] = "@ind_str_asc"
    error_occurred = 0  # Initialize error flag, global
    # Any other non-empty value of "strict" counts as true
    if (strict == "lenient") {strict = 0}
}

{
    # Validate every record, duplicates included, before using it
    if ($2 !~ /^(0[1-9]|1[0-2])\.[0-9]{4}$/) {
        # TODO: Better check for year range
        errprint("Not a valid MM.YYYY: " $2)
    }
    if ($3 !~ /^[0-9]+(\.[0-9]+)?[[:space:]]*GB$/) {
        errprint("Not a valid size: " $3)
    }

    key = $1 FS $2
    if (key in data) {
        msg = "Duplicate data for machine " $1 " in month " $2
        if (strict) {
            errprint(msg)
        } else {
            noteprint(msg)
        }
    } else {
        # Adding 0 keeps only the leading number and drops the " GB"
        data[key] = $3 + 0
    }

    months[$2]++
    machines[$1]++
}

END {
    # Skip the END block if an error occurred
    if (error_occurred) {
        exit 1
    }

    if (length(months) != 2) {
        msg="Data across the entire set is not from exactly two months"
        errprint(msg)
    }

    # Work out which of the two months is the earlier one
    earlier = ""
    later = ""
    for (month in months) {
        if (earlier == "") {
            earlier = month
        } else {
            later = month
        }
    }
    if (month_sort_key(earlier) > month_sort_key(later)) {
        tmp = earlier
        earlier = later
        later = tmp
    }

    allok = 1
    for (machine in machines) {
        # Machines lacking a record for either month are skipped
        if (!((machine FS earlier) in data) || !((machine FS later) in data)) {
            msg = "Machine " machine " does not have exactly one record per month"
            noteprint(msg)
            continue
        }

        before = data[machine FS earlier]
        after = data[machine FS later]

        # A ratio against zero is undefined; report it instead of
        # letting Awk abort with a division-by-zero error
        if (before == 0) {
            allok = 0
            if (verbose) {
                printf "1;%s;%10s [%10.2f GB -> %10.2f GB]\n",
                    machine, "n/a", before, after
            }
            print "2;Ratio for " machine "; is undefined (no data in " earlier ")"
            continue
        }

        ratio = after / before * 100

        if (verbose) {
            printf "1;%s;%9.0f%% [%10.2f GB -> %10.2f GB]\n",
                machine, ratio, before, after
        }

        if (ratio < 95 || ratio > 105) {
            allok = 0
            print "2;Ratio for " machine "; is outside the 95-105% range"
        }
    }

    # Only print the summary when it applies; no stray empty line otherwise
    if (allok) {
        print "3;Ratio for all machines in 95-105% range"
    }
}
  • Each record is semicolon-separated.
  • Fields:
    1. Machine name (string)
    2. Month (in MM.YYYY format)
    3. Data size (numeric value with a "GB" suffix)
05.2024;1178.88 GB;IMBI-0002
04.2024;8259.91 GB;OCI-0001
04.2024;1972.24 GB;MATHI-0001
05.2024;4377.87 GB;TEST-0001
05.2024;42930.72 GB;IUP-0001
05.2024;98.79 GB;URZ-0002
05.2024;6999.23 GB;IDF-0001
04.2024;0.11 GB;INTERN
04.2024;13560.51 GB;IMSE-A-0001
05.2024;125161.29 GB;MEDMA-0002
05.2024;900878.88 GB;HITS-A-0001
05.2024;4704.86 GB;IPMB-0001
05.2024;74438.6 GB;ZMBH-A-0001
05.2024;2935.98 GB;ZUV-0004
04.2024;958.78 GB;DBIO-0001
04.2024;1244.26 GB;ZITI-0003
05.2024;17610.2 GB;UMA-0002
04.2024;654.47 GB;URZ-A-0001
04.2024;90342.36 GB;IWR-0001
05.2024;47060.74 GB;GEOG-0001
./monitor-data-changes.sh 04.csv 05.csv lenient verbose
ZUV-0003                    100% [    161.98 GB ->     161.29 GB]
ZUV-0004                     99% [   2966.57 GB ->    2935.98 GB]
ZUV-0005                    101% [      3.06 GB ->       3.08 GB]
Ratio for COS-0001     is outside the 95-105% range
Ratio for DBIO-0001    is outside the 95-105% range
Ratio for DMATH-0001   is outside the 95-105% range
#!/bin/bash

# This is a driver for Awk script "monitor-data-changes.awk",
# which see for detailed explanations.
#
# Usage: monitor-data-changes.sh file1 file2 strict|lenient [verbose]
#
# file1 and file2 are CSV files whose lines read "month;size;machine".
# The first Awk stage skips comment and empty lines and moves the
# machine name to the front, which is the column order the Awk script
# expects.  The Awk script tags each output line with a numeric prefix;
# sort/cut/column use it to order the report and then strip it.
#
# Exit status: 2 on a usage error, otherwise that of the pipeline
# (non-zero if any stage failed).

# Thure Dührsen, 2024-06-11..12        Initial version

# Report a failing Awk stage through the exit status; without this the
# script would always return the status of the final "column".
set -o pipefail

# Print a usage-error message on stderr and stop with status 2.
die() {
    printf '%s\n' "$1" >&2
    exit 2
}

if [ "$#" -lt 3 ] || [ "$#" -gt 4 ]; then
    {
        echo "Usage: "
        echo "             $0 file1 file2 strict"
        echo "             $0 file1 file2 lenient"
        echo "             $0 file1 file2 strict  verbose"
        echo "             $0 file1 file2 lenient verbose"
        echo
        echo "where file1 and file2 are CSV files,"
        echo "and 'strict', 'lenient', 'verbose' are typed exactly as shown."
    } >&2
    exit 2
fi

[ -f "$1" ] || die "First argument is not a file"
[ -f "$2" ] || die "Second argument is not a file"

case "$3" in
    strict | lenient) ;;
    *) die "Third argument is neither 'strict' nor 'lenient'" ;;
esac

if [ "$#" -eq 4 ] && [ "$4" != 'verbose' ]; then
    die "Fourth argument, if present, must be 'verbose'"
fi

# Look for the Awk script next to this driver rather than in the
# caller's working directory, so the driver works from anywhere.
script_dir=$(dirname -- "${BASH_SOURCE[0]}")

awk -F';' -v OFS=';' '
/^#/ {next} # Skip comment lines
/^$/ {next} # Skip empty lines
NF != 3 {
    print FILENAME ":" FNR ": need exactly three fields per line" > "/dev/stderr"
    exit 1
}
{print $3, $1, $2}
' "$1" "$2"                                                      |
awk -f "$script_dir/monitor-data-changes.awk" -v strict="$3" -v verbose="${4-}" |
sort -t';' -k1,1 -k2,2 -k3,3                                     |
cut -d';' -f2-                                                   |
column -s ';' -t
# Thure Dührsen, 2024-06-11..12        Initial version
# 2024-06-22                           Ensure consistent ratio calc on unsorted input

# For any number of machines, calculates the ratio of data usage
# between two months for each machine: amount of data in later month
# divided by amount of data in earlier month, expressed as a
# percentage.

# Ensures that data is from exactly two months and that each machine
# has a record for both months; machines that do not are reported on
# stderr and left out of the calculation.

# Optionally checks whether there is more than one record per month
# and machine and if so, stops processing the entire file.  Otherwise
# the first record of each (machine, month) pair is used and later
# ones are reported and ignored.

# Prints the ratio and flags machines with ratios outside
# the 95-105% range.

# Every output line starts with "1;", "2;" or "3;" so that the driver
# can sort the report into sections and strip the prefix afterwards.

# Requires GNU Awk (PROCINFO["sorted_in"], regex interval expressions).

# Command-line options (to be used in ./monitor-data-changes.sh):

# -v strict="strict"    Enable strict mode: Reject more than one record
#                       for each (machine, month) pair and stop processing
#                       the entire file if this happens
# -v strict="lenient"   Disable strict mode
#
# -v verbose="verbose"  Enable verbose mode: print ratios and sizes for
#                       all machines

# Preconditions:

# Input is CSV, concatenated from two months, with three fields in each line.
# Semicolon as field delimiter,
# first column is a machine name,
# second column is month and year (MM.YYYY),
# third column is a file size given in GB (floating-point
# number plus the string " GB").

# Exit with an error
function errprint(message) {
    print "ERROR: " message > "/dev/stderr"
    error_occurred = 1 # global error flag
    exit 1 # skip to END block
}

# Warn about bad data, but continue processing
function noteprint(message) {
    print "NOTE: " message > "/dev/stderr"
}

# Turn "MM.YYYY" into "YYYYMM" so that comparing two keys orders the
# months chronologically, also across a year boundary (comparing the
# raw strings would put 01.2025 before 12.2024).
function month_sort_key(month) {
    return substr(month, 4) substr(month, 1, 2)
}

BEGIN {
    FS = ";"
    # Traverse arrays ordered by indices in ascending order compared
    # as strings
    # https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html
    PROCINFO["sorted_in"] = "@ind_str_asc"
    error_occurred = 0  # Initialize error flag, global
    # Any other non-empty value of "strict" counts as true
    if (strict == "lenient") {strict = 0}
}

{
    # Validate every record, duplicates included, before using it
    if ($2 !~ /^(0[1-9]|1[0-2])\.[0-9]{4}$/) {
        # TODO: Better check for year range
        errprint("Not a valid MM.YYYY: " $2)
    }
    if ($3 !~ /^[0-9]+(\.[0-9]+)?[[:space:]]*GB$/) {
        errprint("Not a valid size: " $3)
    }

    key = $1 FS $2
    if (key in data) {
        msg = "Duplicate data for machine " $1 " in month " $2
        if (strict) {
            errprint(msg)
        } else {
            noteprint(msg)
        }
    } else {
        # Adding 0 keeps only the leading number and drops the " GB"
        data[key] = $3 + 0
    }

    months[$2]++
    machines[$1]++
}

END {
    # Skip the END block if an error occurred
    if (error_occurred) {
        exit 1
    }

    if (length(months) != 2) {
        msg="Data across the entire set is not from exactly two months"
        errprint(msg)
    }

    # Work out which of the two months is the earlier one
    earlier = ""
    later = ""
    for (month in months) {
        if (earlier == "") {
            earlier = month
        } else {
            later = month
        }
    }
    if (month_sort_key(earlier) > month_sort_key(later)) {
        tmp = earlier
        earlier = later
        later = tmp
    }

    allok = 1
    for (machine in machines) {
        # Machines lacking a record for either month are skipped
        if (!((machine FS earlier) in data) || !((machine FS later) in data)) {
            msg = "Machine " machine " does not have exactly one record per month"
            noteprint(msg)
            continue
        }

        before = data[machine FS earlier]
        after = data[machine FS later]

        # A ratio against zero is undefined; report it instead of
        # letting Awk abort with a division-by-zero error
        if (before == 0) {
            allok = 0
            if (verbose) {
                printf "1;%s;%10s [%10.2f GB -> %10.2f GB]\n",
                    machine, "n/a", before, after
            }
            print "2;Ratio for " machine "; is undefined (no data in " earlier ")"
            continue
        }

        ratio = after / before * 100

        if (verbose) {
            printf "1;%s;%9.0f%% [%10.2f GB -> %10.2f GB]\n",
                machine, ratio, before, after
        }

        if (ratio < 95 || ratio > 105) {
            allok = 0
            print "2;Ratio for " machine "; is outside the 95-105% range"
        }
    }

    # Only print the summary when it applies; no stray empty line otherwise
    if (allok) {
        print "3;Ratio for all machines in 95-105% range"
    }
}
  • Each record is semicolon-separated.
  • Fields:
    1. Month (in MM.YYYY format)
    2. Data size (numeric value with a "GB" suffix)
    3. Machine name (string)
05.2024;1178.88 GB;IMBI-0002
04.2024;8259.91 GB;OCI-0001
04.2024;1972.24 GB;MATHI-0001
05.2024;4377.87 GB;TEST-0001
05.2024;42930.72 GB;IUP-0001
05.2024;98.79 GB;URZ-0002
05.2024;6999.23 GB;IDF-0001
04.2024;0.11 GB;INTERN
04.2024;13560.51 GB;IMSE-A-0001
05.2024;125161.29 GB;MEDMA-0002
05.2024;900878.88 GB;HITS-A-0001
05.2024;4704.86 GB;IPMB-0001
05.2024;74438.6 GB;ZMBH-A-0001
05.2024;2935.98 GB;ZUV-0004
04.2024;958.78 GB;DBIO-0001
04.2024;1244.26 GB;ZITI-0003
05.2024;17610.2 GB;UMA-0002
04.2024;654.47 GB;URZ-A-0001
04.2024;90342.36 GB;IWR-0001
05.2024;47060.74 GB;GEOG-0001
./monitor-data-changes.sh 04.csv 05.csv lenient verbose
ZUV-0003                    100% [    161.98 GB ->     161.29 GB]
ZUV-0004                     99% [   2966.57 GB ->    2935.98 GB]
ZUV-0005                    101% [      3.06 GB ->       3.08 GB]
Ratio for COS-0001     is outside the 95-105% range
Ratio for DBIO-0001    is outside the 95-105% range
Ratio for DMATH-0001   is outside the 95-105% range
#!/bin/bash

# This is a driver for Awk script "monitor-data-changes.awk",
# which see for detailed explanations.
#
# Usage: monitor-data-changes.sh file1 file2 strict|lenient [verbose]
#
# file1 and file2 are CSV files whose lines read "month;size;machine".
# The first Awk stage skips comment and empty lines and moves the
# machine name to the front, which is the column order the Awk script
# expects.  The Awk script tags each output line with a numeric prefix;
# sort/cut/column use it to order the report and then strip it.
#
# Exit status: 2 on a usage error, otherwise that of the pipeline
# (non-zero if any stage failed).

# Thure Dührsen, 2024-06-11..12        Initial version

# Report a failing Awk stage through the exit status; without this the
# script would always return the status of the final "column".
set -o pipefail

# Print a usage-error message on stderr and stop with status 2.
die() {
    printf '%s\n' "$1" >&2
    exit 2
}

if [ "$#" -lt 3 ] || [ "$#" -gt 4 ]; then
    {
        echo "Usage: "
        echo "             $0 file1 file2 strict"
        echo "             $0 file1 file2 lenient"
        echo "             $0 file1 file2 strict  verbose"
        echo "             $0 file1 file2 lenient verbose"
        echo
        echo "where file1 and file2 are CSV files,"
        echo "and 'strict', 'lenient', 'verbose' are typed exactly as shown."
    } >&2
    exit 2
fi

[ -f "$1" ] || die "First argument is not a file"
[ -f "$2" ] || die "Second argument is not a file"

case "$3" in
    strict | lenient) ;;
    *) die "Third argument is neither 'strict' nor 'lenient'" ;;
esac

if [ "$#" -eq 4 ] && [ "$4" != 'verbose' ]; then
    die "Fourth argument, if present, must be 'verbose'"
fi

# Look for the Awk script next to this driver rather than in the
# caller's working directory, so the driver works from anywhere.
script_dir=$(dirname -- "${BASH_SOURCE[0]}")

awk -F';' -v OFS=';' '
/^#/ {next} # Skip comment lines
/^$/ {next} # Skip empty lines
NF != 3 {
    print FILENAME ":" FNR ": need exactly three fields per line" > "/dev/stderr"
    exit 1
}
{print $3, $1, $2}
' "$1" "$2"                                                      |
awk -f "$script_dir/monitor-data-changes.awk" -v strict="$3" -v verbose="${4-}" |
sort -t';' -k1,1 -k2,2 -k3,3                                     |
cut -d';' -f2-                                                   |
column -s ';' -t
# Thure Dührsen, 2024-06-11..12        Initial version
# 2024-06-22                           Ensure consistent ratio calc on unsorted input

# For any number of machines, calculates the ratio of data usage
# between two months for each machine: amount of data in later month
# divided by amount of data in earlier month, expressed as a
# percentage.

# Ensures that data is from exactly two months and that each machine
# has a record for both months; machines that do not are reported on
# stderr and left out of the calculation.

# Optionally checks whether there is more than one record per month
# and machine and if so, stops processing the entire file.  Otherwise
# the first record of each (machine, month) pair is used and later
# ones are reported and ignored.

# Prints the ratio and flags machines with ratios outside
# the 95-105% range.

# Every output line starts with "1;", "2;" or "3;" so that the driver
# can sort the report into sections and strip the prefix afterwards.

# Requires GNU Awk (PROCINFO["sorted_in"], regex interval expressions).

# Command-line options (to be used in ./monitor-data-changes.sh):

# -v strict="strict"    Enable strict mode: Reject more than one record
#                       for each (machine, month) pair and stop processing
#                       the entire file if this happens
# -v strict="lenient"   Disable strict mode
#
# -v verbose="verbose"  Enable verbose mode: print ratios and sizes for
#                       all machines

# Preconditions:

# Input is CSV, concatenated from two months, with three fields in each line.
# Semicolon as field delimiter,
# first column is a machine name,
# second column is month and year (MM.YYYY),
# third column is a file size given in GB (floating-point
# number plus the string " GB").

# Exit with an error
function errprint(message) {
    print "ERROR: " message > "/dev/stderr"
    error_occurred = 1 # global error flag
    exit 1 # skip to END block
}

# Warn about bad data, but continue processing
function noteprint(message) {
    print "NOTE: " message > "/dev/stderr"
}

# Turn "MM.YYYY" into "YYYYMM" so that comparing two keys orders the
# months chronologically, also across a year boundary (comparing the
# raw strings would put 01.2025 before 12.2024).
function month_sort_key(month) {
    return substr(month, 4) substr(month, 1, 2)
}

BEGIN {
    FS = ";"
    # Traverse arrays ordered by indices in ascending order compared
    # as strings
    # https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html
    PROCINFO["sorted_in"] = "@ind_str_asc"
    error_occurred = 0  # Initialize error flag, global
    # Any other non-empty value of "strict" counts as true
    if (strict == "lenient") {strict = 0}
}

{
    # Validate every record, duplicates included, before using it
    if ($2 !~ /^(0[1-9]|1[0-2])\.[0-9]{4}$/) {
        # TODO: Better check for year range
        errprint("Not a valid MM.YYYY: " $2)
    }
    if ($3 !~ /^[0-9]+(\.[0-9]+)?[[:space:]]*GB$/) {
        errprint("Not a valid size: " $3)
    }

    key = $1 FS $2
    if (key in data) {
        msg = "Duplicate data for machine " $1 " in month " $2
        if (strict) {
            errprint(msg)
        } else {
            noteprint(msg)
        }
    } else {
        # Adding 0 keeps only the leading number and drops the " GB"
        data[key] = $3 + 0
    }

    months[$2]++
    machines[$1]++
}

END {
    # Skip the END block if an error occurred
    if (error_occurred) {
        exit 1
    }

    if (length(months) != 2) {
        msg="Data across the entire set is not from exactly two months"
        errprint(msg)
    }

    # Work out which of the two months is the earlier one
    earlier = ""
    later = ""
    for (month in months) {
        if (earlier == "") {
            earlier = month
        } else {
            later = month
        }
    }
    if (month_sort_key(earlier) > month_sort_key(later)) {
        tmp = earlier
        earlier = later
        later = tmp
    }

    allok = 1
    for (machine in machines) {
        # Machines lacking a record for either month are skipped
        if (!((machine FS earlier) in data) || !((machine FS later) in data)) {
            msg = "Machine " machine " does not have exactly one record per month"
            noteprint(msg)
            continue
        }

        before = data[machine FS earlier]
        after = data[machine FS later]

        # A ratio against zero is undefined; report it instead of
        # letting Awk abort with a division-by-zero error
        if (before == 0) {
            allok = 0
            if (verbose) {
                printf "1;%s;%10s [%10.2f GB -> %10.2f GB]\n",
                    machine, "n/a", before, after
            }
            print "2;Ratio for " machine "; is undefined (no data in " earlier ")"
            continue
        }

        ratio = after / before * 100

        if (verbose) {
            printf "1;%s;%9.0f%% [%10.2f GB -> %10.2f GB]\n",
                machine, ratio, before, after
        }

        if (ratio < 95 || ratio > 105) {
            allok = 0
            print "2;Ratio for " machine "; is outside the 95-105% range"
        }
    }

    # Only print the summary when it applies; no stray empty line otherwise
    if (allok) {
        print "3;Ratio for all machines in 95-105% range"
    }
}
Became Hot Network Question
Source Link

Generate filesystem usage report using Awk

I would like to request a review of an Awk program I have written. This script meets an immediate business need and, ideally, serves as a coding style example for new hires (who may not yet be proficient in Awk). Consequently, the script is not as compact as it could be, and not all error conditions (e.g., negative data amounts, empty machine names) are checked. The main review questions are, therefore, more pedagogical in nature:

  • Would you put all of the logic into the Awk program, or leave some of the validation in the shell driver?
  • Does the Awk program need more comments?
  • Can you think of anything else to make the programs easier to understand to novices?
  • Regarding easy maintenance, is there any reason for a coding style change?

I would also like to ask an expert question.

  • Are there more intuitive ways to sort the output? Prepending 1; or 2; or 3; works, but.... meh.

First, the specification:

Objective

Create an AWK program to process a semicolon-separated data file containing information about machine data usage over two months. The program should validate the data, identify and report any errors or inconsistencies, and calculate usage ratios between the two months for each machine.

Input Data Format

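  • Each record is semicolon-separated.
  • Fields (in the order the Awk program receives them; the raw CSV files shown below store them as month;size;machine, and the driver script moves the machine name to the front):
    1. Machine name (string)
    2. Month (in MM.YYYY format)
    3. Data size (numeric value with a "GB" suffix)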

Requirements

  1. Input Validation:

    • Ensure each record's second field (month) follows the MM.YYYY format.
    • Ensure each record's third field (data size) is a valid numeric value followed by "GB".
    • If a record does not conform to these formats, print an error message and terminate the program.
  2. Duplicate Handling:

    • Detect and handle duplicate entries for the same machine and month.
    • If running in strict mode, terminate the program on encountering a duplicate.
    • If not in strict mode, print a warning and continue processing.
  3. Data Consistency:

    • Ensure the data set contains records from exactly two distinct months.
    • Ensure each machine has exactly one record for each of the two months.
    • If a machine does not have records for both months, exclude it from the ratio calculation and print a warning.
  4. Ratio Calculation:

    • For each machine with valid records for both months, calculate the ratio of data size between the second and the first month.
    • If the ratio is outside the 95-105% range, print a warning for that machine.
  5. Output:

    • Print the ratio for each machine along with the original data sizes if the verbose mode is enabled.
    • Print a summary message indicating whether all machines have ratios within the 95-105% range.

Typical input looks like this (cat 04.csv 05.csv | shuf | head -n 20):

05.2024;1178.88 GB;IMBI-0002
04.2024;8259.91 GB;OCI-0001
04.2024;1972.24 GB;MATHI-0001
05.2024;4377.87 GB;TEST-0001
05.2024;42930.72 GB;IUP-0001
05.2024;98.79 GB;URZ-0002
05.2024;6999.23 GB;IDF-0001
04.2024;0.11 GB;INTERN
04.2024;13560.51 GB;IMSE-A-0001
05.2024;125161.29 GB;MEDMA-0002
05.2024;900878.88 GB;HITS-A-0001
05.2024;4704.86 GB;IPMB-0001
05.2024;74438.6 GB;ZMBH-A-0001
05.2024;2935.98 GB;ZUV-0004
04.2024;958.78 GB;DBIO-0001
04.2024;1244.26 GB;ZITI-0003
05.2024;17610.2 GB;UMA-0002
04.2024;654.47 GB;URZ-A-0001
04.2024;90342.36 GB;IWR-0001
05.2024;47060.74 GB;GEOG-0001

Calling the driver:

./monitor-data-changes.sh 04.csv 05.csv lenient verbose

Typical output:

ZUV-0003                    100% [    161.98 GB ->     161.29 GB]
ZUV-0004                     99% [   2966.57 GB ->    2935.98 GB]
ZUV-0005                    101% [      3.06 GB ->       3.08 GB]
Ratio for COS-0001     is outside the 95-105% range
Ratio for DBIO-0001    is outside the 95-105% range
Ratio for DMATH-0001   is outside the 95-105% range

The driver script:

#!/bin/bash

# This is a driver for Awk script "monitor-data-changes.awk",
# which see for detailed explanations.
#
# Usage: monitor-data-changes.sh file1 file2 strict|lenient [verbose]
#
# file1 and file2 are CSV files whose lines read "month;size;machine".
# The first Awk stage skips comment and empty lines and moves the
# machine name to the front, which is the column order the Awk script
# expects.  The Awk script tags each output line with a numeric prefix;
# sort/cut/column use it to order the report and then strip it.
#
# Exit status: 2 on a usage error, otherwise that of the pipeline
# (non-zero if any stage failed).

# Thure Dührsen, 2024-06-11..12        Initial version

# Report a failing Awk stage through the exit status; without this the
# script would always return the status of the final "column".
set -o pipefail

# Print a usage-error message on stderr and stop with status 2.
die() {
    printf '%s\n' "$1" >&2
    exit 2
}

if [ "$#" -lt 3 ] || [ "$#" -gt 4 ]; then
    {
        echo "Usage: "
        echo "             $0 file1 file2 strict"
        echo "             $0 file1 file2 lenient"
        echo "             $0 file1 file2 strict  verbose"
        echo "             $0 file1 file2 lenient verbose"
        echo
        echo "where file1 and file2 are CSV files,"
        echo "and 'strict', 'lenient', 'verbose' are typed exactly as shown."
    } >&2
    exit 2
fi

[ -f "$1" ] || die "First argument is not a file"
[ -f "$2" ] || die "Second argument is not a file"

case "$3" in
    strict | lenient) ;;
    *) die "Third argument is neither 'strict' nor 'lenient'" ;;
esac

if [ "$#" -eq 4 ] && [ "$4" != 'verbose' ]; then
    die "Fourth argument, if present, must be 'verbose'"
fi

# Look for the Awk script next to this driver rather than in the
# caller's working directory, so the driver works from anywhere.
script_dir=$(dirname -- "${BASH_SOURCE[0]}")

awk -F';' -v OFS=';' '
/^#/ {next} # Skip comment lines
/^$/ {next} # Skip empty lines
NF != 3 {
    print FILENAME ":" FNR ": need exactly three fields per line" > "/dev/stderr"
    exit 1
}
{print $3, $1, $2}
' "$1" "$2"                                                      |
awk -f "$script_dir/monitor-data-changes.awk" -v strict="$3" -v verbose="${4-}" |
sort -t';' -k1,1 -k2,2 -k3,3                                     |
cut -d';' -f2-                                                   |
column -s ';' -t

The Awk script:

# Thure Dührsen, 2024-06-11..12        Initial version
# 2024-06-22                           Ensure consistent ratio calc on unsorted input

# For any number of machines, calculates the ratio of data usage
# between two months for each machine: amount of data in later month
# divided by amount of data in earlier month, expressed as a
# percentage.

# Ensures that data is from exactly two months and that each machine
# has a record for both months; machines that do not are reported on
# stderr and left out of the calculation.

# Optionally checks whether there is more than one record per month
# and machine and if so, stops processing the entire file.  Otherwise
# the first record of each (machine, month) pair is used and later
# ones are reported and ignored.

# Prints the ratio and flags machines with ratios outside
# the 95-105% range.

# Every output line starts with "1;", "2;" or "3;" so that the driver
# can sort the report into sections and strip the prefix afterwards.

# Requires GNU Awk (PROCINFO["sorted_in"], regex interval expressions).

# Command-line options (to be used in ./monitor-data-changes.sh):

# -v strict="strict"    Enable strict mode: Reject more than one record
#                       for each (machine, month) pair and stop processing
#                       the entire file if this happens
# -v strict="lenient"   Disable strict mode
#
# -v verbose="verbose"  Enable verbose mode: print ratios and sizes for
#                       all machines

# Preconditions:

# Input is CSV, concatenated from two months, with three fields in each line.
# Semicolon as field delimiter,
# first column is a machine name,
# second column is month and year (MM.YYYY),
# third column is a file size given in GB (floating-point
# number plus the string " GB").

# Exit with an error
function errprint(message) {
    print "ERROR: " message > "/dev/stderr"
    error_occurred = 1 # global error flag
    exit 1 # skip to END block
}

# Warn about bad data, but continue processing
function noteprint(message) {
    print "NOTE: " message > "/dev/stderr"
}

# Turn "MM.YYYY" into "YYYYMM" so that comparing two keys orders the
# months chronologically, also across a year boundary (comparing the
# raw strings would put 01.2025 before 12.2024).
function month_sort_key(month) {
    return substr(month, 4) substr(month, 1, 2)
}

BEGIN {
    FS = ";"
    # Traverse arrays ordered by indices in ascending order compared
    # as strings
    # https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html
    PROCINFO["sorted_in"] = "@ind_str_asc"
    error_occurred = 0  # Initialize error flag, global
    # Any other non-empty value of "strict" counts as true
    if (strict == "lenient") {strict = 0}
}

{
    # Validate every record, duplicates included, before using it
    if ($2 !~ /^(0[1-9]|1[0-2])\.[0-9]{4}$/) {
        # TODO: Better check for year range
        errprint("Not a valid MM.YYYY: " $2)
    }
    if ($3 !~ /^[0-9]+(\.[0-9]+)?[[:space:]]*GB$/) {
        errprint("Not a valid size: " $3)
    }

    key = $1 FS $2
    if (key in data) {
        msg = "Duplicate data for machine " $1 " in month " $2
        if (strict) {
            errprint(msg)
        } else {
            noteprint(msg)
        }
    } else {
        # Adding 0 keeps only the leading number and drops the " GB"
        data[key] = $3 + 0
    }

    months[$2]++
    machines[$1]++
}

END {
    # Skip the END block if an error occurred
    if (error_occurred) {
        exit 1
    }

    if (length(months) != 2) {
        msg="Data across the entire set is not from exactly two months"
        errprint(msg)
    }

    # Work out which of the two months is the earlier one
    earlier = ""
    later = ""
    for (month in months) {
        if (earlier == "") {
            earlier = month
        } else {
            later = month
        }
    }
    if (month_sort_key(earlier) > month_sort_key(later)) {
        tmp = earlier
        earlier = later
        later = tmp
    }

    allok = 1
    for (machine in machines) {
        # Machines lacking a record for either month are skipped
        if (!((machine FS earlier) in data) || !((machine FS later) in data)) {
            msg = "Machine " machine " does not have exactly one record per month"
            noteprint(msg)
            continue
        }

        before = data[machine FS earlier]
        after = data[machine FS later]

        # A ratio against zero is undefined; report it instead of
        # letting Awk abort with a division-by-zero error
        if (before == 0) {
            allok = 0
            if (verbose) {
                printf "1;%s;%10s [%10.2f GB -> %10.2f GB]\n",
                    machine, "n/a", before, after
            }
            print "2;Ratio for " machine "; is undefined (no data in " earlier ")"
            continue
        }

        ratio = after / before * 100

        if (verbose) {
            printf "1;%s;%9.0f%% [%10.2f GB -> %10.2f GB]\n",
                machine, ratio, before, after
        }

        if (ratio < 95 || ratio > 105) {
            allok = 0
            print "2;Ratio for " machine "; is outside the 95-105% range"
        }
    }

    # Only print the summary when it applies; no stray empty line otherwise
    if (allok) {
        print "3;Ratio for all machines in 95-105% range"
    }
}