- Each record is semicolon-separated.
- Fields, in this order:
- Month (in MM.YYYY format)
- Data size (numeric value with a "GB" suffix)
- Machine name (string)
05.2024;1178.88 GB;IMBI-0002
04.2024;8259.91 GB;OCI-0001
04.2024;1972.24 GB;MATHI-0001
05.2024;4377.87 GB;TEST-0001
05.2024;42930.72 GB;IUP-0001
05.2024;98.79 GB;URZ-0002
05.2024;6999.23 GB;IDF-0001
04.2024;0.11 GB;INTERN
04.2024;13560.51 GB;IMSE-A-0001
05.2024;125161.29 GB;MEDMA-0002
05.2024;900878.88 GB;HITS-A-0001
05.2024;4704.86 GB;IPMB-0001
05.2024;74438.6 GB;ZMBH-A-0001
05.2024;2935.98 GB;ZUV-0004
04.2024;958.78 GB;DBIO-0001
04.2024;1244.26 GB;ZITI-0003
05.2024;17610.2 GB;UMA-0002
04.2024;654.47 GB;URZ-A-0001
04.2024;90342.36 GB;IWR-0001
05.2024;47060.74 GB;GEOG-0001
05.2024;1178.88 GB;IMBI-0002
04.2024;8259.91 GB;OCI-0001
04.2024;1972.24 GB;MATHI-0001
05.2024;4377.87 GB;TEST-0001
05.2024;42930.72 GB;IUP-0001
05.2024;98.79 GB;URZ-0002
05.2024;6999.23 GB;IDF-0001
04.2024;0.11 GB;INTERN
04.2024;13560.51 GB;IMSE-A-0001
05.2024;125161.29 GB;MEDMA-0002
05.2024;900878.88 GB;HITS-A-0001
05.2024;4704.86 GB;IPMB-0001
05.2024;74438.6 GB;ZMBH-A-0001
05.2024;2935.98 GB;ZUV-0004
04.2024;958.78 GB;DBIO-0001
04.2024;1244.26 GB;ZITI-0003
05.2024;17610.2 GB;UMA-0002
04.2024;654.47 GB;URZ-A-0001
04.2024;90342.36 GB;IWR-0001
05.2024;47060.74 GB;GEOG-0001
./monitor-data-changes.sh 04.csv 05.csv lenient verbose
./monitor-data-changes.sh 04.csv 05.csv lenient verbose
ZUV-0003 100% [ 161.98 GB -> 161.29 GB]
ZUV-0004 99% [ 2966.57 GB -> 2935.98 GB]
ZUV-0005 101% [ 3.06 GB -> 3.08 GB]
Ratio for COS-0001 is outside the 95-105% range
Ratio for DBIO-0001 is outside the 95-105% range
Ratio for DMATH-0001 is outside the 95-105% range
ZUV-0003 100% [ 161.98 GB -> 161.29 GB]
ZUV-0004 99% [ 2966.57 GB -> 2935.98 GB]
ZUV-0005 101% [ 3.06 GB -> 3.08 GB]
Ratio for COS-0001 is outside the 95-105% range
Ratio for DBIO-0001 is outside the 95-105% range
Ratio for DMATH-0001 is outside the 95-105% range
#!/bin/bash
# Driver for the Awk script "monitor-data-changes.awk"; see that script
# for a detailed explanation of the checks it performs.
#
# Usage: monitor-data-changes.sh file1 file2 strict|lenient [verbose]
#
# Both input files hold "month;size;machine" records.  The first awk
# stage drops comment and empty lines and reorders each record to
# "machine;month;size" before handing it to the Awk script.  The Awk
# script prefixes every report line with a numeric sort key, which is
# used by "sort" and then removed again by "cut".
#
# Exit status: 2 for a usage error; otherwise the status of the
# pipeline, which is non-zero as soon as any stage fails.
#
# Thure Dührsen, 2024-06-11..12 Initial version

# Without pipefail the script's status would be that of the final
# "column" stage, hiding failures of either awk stage from the caller.
set -o pipefail

# Print a message on stderr and exit with the usage-error status.
die() {
    echo "$1" >&2
    exit 2
}

if [ "$#" -lt 3 ] || [ "$#" -gt 4 ]; then
    {
        echo "Usage: "
        echo " $0 file1 file2 strict"
        echo " $0 file1 file2 lenient"
        echo " $0 file1 file2 strict verbose"
        echo " $0 file1 file2 lenient verbose"
        echo
        echo "where file1 and file2 are CSV files,"
        echo "and 'strict', 'lenient', 'verbose' are typed exactly as shown."
    } >&2
    exit 2
fi

if [ ! -f "$1" ]; then
    die "First argument is not a file"
fi

if [ ! -f "$2" ]; then
    die "Second argument is not a file"
fi

if [ "$3" != 'strict' ] && [ "$3" != 'lenient' ]; then
    die "Third argument is neither 'strict' nor 'lenient'"
fi

if [ "$#" -eq 4 ] && [ "$4" != 'verbose' ]; then
    die "Fourth argument, if present, must be 'verbose'"
fi

# Look for the Awk script next to this driver so that the driver can be
# started from any directory; fall back to the current directory, which
# is where it was looked up previously.
awk_script="$(dirname -- "${BASH_SOURCE[0]}")/monitor-data-changes.awk"
if [ ! -f "$awk_script" ]; then
    awk_script=./monitor-data-changes.awk
fi

# The Awk script uses PROCINFO["sorted_in"], a gawk extension, so prefer
# gawk when it is installed and only fall back to the system awk.
if command -v gawk >/dev/null 2>&1; then
    awk_cmd=gawk
else
    awk_cmd=awk
fi

awk -F';' -v OFS=';' '
    /^#/ {next} # Skip comment lines
    /^$/ {next} # Skip empty lines
    NF != 3 {
        print "need exactly three fields per line" > "/dev/stderr"
        exit 1
    }
    {print $3, $1, $2}
' "$1" "$2" |
    "$awk_cmd" -f "$awk_script" -v strict="$3" -v verbose="${4-}" |
    sort -t';' -k1,1 -k2,2 -k3,3 |
    cut -d';' -f2- |
    column -s ';' -t
#!/bin/bash
# Driver for the Awk script "monitor-data-changes.awk"; see that script
# for a detailed explanation of the checks it performs.
#
# Usage: monitor-data-changes.sh file1 file2 strict|lenient [verbose]
#
# Both input files hold "month;size;machine" records.  The first awk
# stage drops comment and empty lines and reorders each record to
# "machine;month;size" before handing it to the Awk script.  The Awk
# script prefixes every report line with a numeric sort key, which is
# used by "sort" and then removed again by "cut".
#
# Exit status: 2 for a usage error; otherwise the status of the
# pipeline, which is non-zero as soon as any stage fails.
#
# Thure Dührsen, 2024-06-11..12 Initial version

# Without pipefail the script's status would be that of the final
# "column" stage, hiding failures of either awk stage from the caller.
set -o pipefail

# Print a message on stderr and exit with the usage-error status.
die() {
    echo "$1" >&2
    exit 2
}

if [ "$#" -lt 3 ] || [ "$#" -gt 4 ]; then
    {
        echo "Usage: "
        echo " $0 file1 file2 strict"
        echo " $0 file1 file2 lenient"
        echo " $0 file1 file2 strict verbose"
        echo " $0 file1 file2 lenient verbose"
        echo
        echo "where file1 and file2 are CSV files,"
        echo "and 'strict', 'lenient', 'verbose' are typed exactly as shown."
    } >&2
    exit 2
fi

if [ ! -f "$1" ]; then
    die "First argument is not a file"
fi

if [ ! -f "$2" ]; then
    die "Second argument is not a file"
fi

if [ "$3" != 'strict' ] && [ "$3" != 'lenient' ]; then
    die "Third argument is neither 'strict' nor 'lenient'"
fi

if [ "$#" -eq 4 ] && [ "$4" != 'verbose' ]; then
    die "Fourth argument, if present, must be 'verbose'"
fi

# Look for the Awk script next to this driver so that the driver can be
# started from any directory; fall back to the current directory, which
# is where it was looked up previously.
awk_script="$(dirname -- "${BASH_SOURCE[0]}")/monitor-data-changes.awk"
if [ ! -f "$awk_script" ]; then
    awk_script=./monitor-data-changes.awk
fi

# The Awk script uses PROCINFO["sorted_in"], a gawk extension, so prefer
# gawk when it is installed and only fall back to the system awk.
if command -v gawk >/dev/null 2>&1; then
    awk_cmd=gawk
else
    awk_cmd=awk
fi

awk -F';' -v OFS=';' '
    /^#/ {next} # Skip comment lines
    /^$/ {next} # Skip empty lines
    NF != 3 {
        print "need exactly three fields per line" > "/dev/stderr"
        exit 1
    }
    {print $3, $1, $2}
' "$1" "$2" |
    "$awk_cmd" -f "$awk_script" -v strict="$3" -v verbose="${4-}" |
    sort -t';' -k1,1 -k2,2 -k3,3 |
    cut -d';' -f2- |
    column -s ';' -t
# Thure Dührsen, 2024-06-11..12 Initial version
# 2024-06-22 Ensure consistent ratio calc on unsorted input
# For any number of machines, calculates the ratio of data usage
# between two months for each machine: amount of data in later month
# divided by amount of data in earlier month, expressed as a
# percentage.
# Ensures that data is from exactly two months and that each machine
# has at least one record per month.
# Optionally checks whether there is more than one record per month
# and machine and if so, stops processing the entire file.
# Prints the ratio and flags machines with ratios outside
# the 95-105% range.
# Command-line options (to be used in ./monitor-data-changes.sh):
# -v strict="strict" Enable strict mode: Reject more than one record
# for each (machine, month) pair and stop processing
# the entire file if this happens
# -v strict="lenient" Disable strict mode
#
# -v verbose="verbose" Enable verbose mode: print ratios and sizes for
# all machines
# Preconditions:
# Input is CSV, concatenated from two months, with three fields in each line.
# Semicolon as field delimiter,
# first column is a machine name,
# second column is month and year (MM.YYYY),
# third column is a file size given in GB (floating-point
# number plus the string " GB").
# Report a fatal problem: raise the global error flag, write the
# message to stderr with an "ERROR: " prefix, and exit with status 1.
# When called from a main-rule action, awk still runs the END block
# afterwards; END checks the flag and stops without printing a report.
function errprint(message) {
    error_occurred = 1
    printf "ERROR: %s\n", message > "/dev/stderr"
    exit 1
}
# Report questionable data on stderr with a "NOTE: " prefix.
# Processing carries on after the message has been written.
function noteprint(message) {
    printf "NOTE: %s\n", message > "/dev/stderr"
}
BEGIN {
    # Global flag raised by errprint(); consulted at the top of END.
    error_occurred = 0

    # Records are semicolon-separated.
    FS = ";"

    # The driver passes "strict" or "lenient"; map "lenient" to a false
    # value so that the variable can be tested as a plain boolean.
    if (strict == "lenient")
        strict = 0

    # Make every "for (index in array)" loop visit the indices in
    # ascending string order (gawk extension), see
    # https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html
    PROCINFO["sorted_in"] = "@ind_str_asc"
}
# Runs for every input record (machine;month;size).
# The first record of each (machine, month) pair is validated and its
# size is stored; any further record for the same pair is reported as a
# duplicate, which is fatal in strict mode and only a note otherwise.
{
    key = $1 FS $2

    if (key in seen) {
        msg = "Duplicate data for machine " $1 " in month " $2
        if (!strict) {
            noteprint(msg)
        } else {
            errprint(msg)
        }
    } else {
        # TODO: Better check for year range
        if ($2 !~ /^(0[1-9]|1[0-2])\.[0-9]{4}$/)
            errprint("Not a valid MM.YYYY: " $2)

        if ($3 !~ /^[0-9]+(\.[0-9]+)?[[:space:]]*GB$/)
            errprint("Not a valid size: " $3)

        seen[key] = 1
        data[key] = $3
    }

    # Record which machines and months occur in the input at all;
    # END iterates over the indices of both arrays.
    machines[$1]++
    months[$2]++
}
# Produce the report once all records have been read.
#
# Every report line starts with a numeric key ("1;" per-machine detail,
# "2;" out-of-range warning, "3;" all-clear summary); the driver sorts
# on that key and strips it again before displaying the result.
END {
    # errprint() raises error_occurred before exiting from a main-rule
    # action; awk still runs END after that exit, so stop here without
    # producing a report.
    if (error_occurred) {
        exit 1
    }

    if (length(months) != 2) {
        msg="Data across the entire set is not from exactly two months"
        errprint(msg)
    }

    # Decide which of the two months is the earlier one by comparing
    # YYYYMM numbers.  Ordering the MM.YYYY strings themselves would put
    # 01.2024 before 12.2023 and invert every ratio across a year
    # boundary, and it would depend on gawk-only array ordering.
    # The main rule has already validated the MM.YYYY format.
    n_months = 0
    for (month in months) {
        n_months++
        month_name[n_months] = month
        month_rank[n_months] = (substr(month, 4, 4) substr(month, 1, 2)) + 0
    }
    if (month_rank[1] <= month_rank[2]) {
        earlier = month_name[1]
        later = month_name[2]
    } else {
        earlier = month_name[2]
        later = month_name[1]
    }

    # Drop machines that lack a record for either month.
    for (machine in machines) {
        if (!((machine FS earlier) in data) || !((machine FS later) in data)) {
            msg = "Machine " machine " does not have exactly one record per month"
            noteprint(msg)
            delete machines[machine]
        }
    }

    allok = 1
    for (machine in machines) {
        # Adding 0 converts "123.45 GB" to the number 123.45: awk uses
        # the leading numeric prefix of the string.
        size_before = data[machine FS earlier] + 0
        size_after = data[machine FS later] + 0

        # A baseline of 0 GB has no meaningful ratio; dividing by it
        # would abort the whole run with a fatal error.
        if (size_before == 0) {
            allok = 0
            print "2;Ratio for " machine "; is undefined (0 GB in " earlier ")"
            continue
        }

        ratio = size_after / size_before * 100
        if (verbose) {
            printf "1;%s;%9.0f%% [%10.2f GB -> %10.2f GB]\n",
                machine, ratio, size_before, size_after
        }
        if (ratio < 95 || ratio > 105) {
            allok = 0
            print "2;Ratio for " machine "; is outside the 95-105% range"
        }
    }

    # Only print the summary when it applies; do not emit an empty line
    # otherwise.
    if (allok) {
        print "3;Ratio for all machines in 95-105% range"
    }
}
# Thure Dührsen, 2024-06-11..12 Initial version
# 2024-06-22 Ensure consistent ratio calc on unsorted input
# For any number of machines, calculates the ratio of data usage
# between two months for each machine: amount of data in later month
# divided by amount of data in earlier month, expressed as a
# percentage.
# Ensures that data is from exactly two months and that each machine
# has at least one record per month.
# Optionally checks whether there is more than one record per month
# and machine and if so, stops processing the entire file.
# Prints the ratio and flags machines with ratios outside
# the 95-105% range.
# Command-line options (to be used in ./monitor-data-changes.sh):
# -v strict="strict" Enable strict mode: Reject more than one record
# for each (machine, month) pair and stop processing
# the entire file if this happens
# -v strict="lenient" Disable strict mode
#
# -v verbose="verbose" Enable verbose mode: print ratios and sizes for
# all machines
# Preconditions:
# Input is CSV, concatenated from two months, with three fields in each line.
# Semicolon as field delimiter,
# first column is a machine name,
# second column is month and year (MM.YYYY),
# third column is a file size given in GB (floating-point
# number plus the string " GB").
# Report a fatal problem: raise the global error flag, write the
# message to stderr with an "ERROR: " prefix, and exit with status 1.
# When called from a main-rule action, awk still runs the END block
# afterwards; END checks the flag and stops without printing a report.
function errprint(message) {
    error_occurred = 1
    printf "ERROR: %s\n", message > "/dev/stderr"
    exit 1
}
# Report questionable data on stderr with a "NOTE: " prefix.
# Processing carries on after the message has been written.
function noteprint(message) {
    printf "NOTE: %s\n", message > "/dev/stderr"
}
BEGIN {
    # Global flag raised by errprint(); consulted at the top of END.
    error_occurred = 0

    # Records are semicolon-separated.
    FS = ";"

    # The driver passes "strict" or "lenient"; map "lenient" to a false
    # value so that the variable can be tested as a plain boolean.
    if (strict == "lenient")
        strict = 0

    # Make every "for (index in array)" loop visit the indices in
    # ascending string order (gawk extension), see
    # https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html
    PROCINFO["sorted_in"] = "@ind_str_asc"
}
# Runs for every input record (machine;month;size).
# The first record of each (machine, month) pair is validated and its
# size is stored; any further record for the same pair is reported as a
# duplicate, which is fatal in strict mode and only a note otherwise.
{
    key = $1 FS $2

    if (key in seen) {
        msg = "Duplicate data for machine " $1 " in month " $2
        if (!strict) {
            noteprint(msg)
        } else {
            errprint(msg)
        }
    } else {
        # TODO: Better check for year range
        if ($2 !~ /^(0[1-9]|1[0-2])\.[0-9]{4}$/)
            errprint("Not a valid MM.YYYY: " $2)

        if ($3 !~ /^[0-9]+(\.[0-9]+)?[[:space:]]*GB$/)
            errprint("Not a valid size: " $3)

        seen[key] = 1
        data[key] = $3
    }

    # Record which machines and months occur in the input at all;
    # END iterates over the indices of both arrays.
    machines[$1]++
    months[$2]++
}
# Produce the report once all records have been read.
#
# Every report line starts with a numeric key ("1;" per-machine detail,
# "2;" out-of-range warning, "3;" all-clear summary); the driver sorts
# on that key and strips it again before displaying the result.
END {
    # errprint() raises error_occurred before exiting from a main-rule
    # action; awk still runs END after that exit, so stop here without
    # producing a report.
    if (error_occurred) {
        exit 1
    }

    if (length(months) != 2) {
        msg="Data across the entire set is not from exactly two months"
        errprint(msg)
    }

    # Decide which of the two months is the earlier one by comparing
    # YYYYMM numbers.  Ordering the MM.YYYY strings themselves would put
    # 01.2024 before 12.2023 and invert every ratio across a year
    # boundary, and it would depend on gawk-only array ordering.
    # The main rule has already validated the MM.YYYY format.
    n_months = 0
    for (month in months) {
        n_months++
        month_name[n_months] = month
        month_rank[n_months] = (substr(month, 4, 4) substr(month, 1, 2)) + 0
    }
    if (month_rank[1] <= month_rank[2]) {
        earlier = month_name[1]
        later = month_name[2]
    } else {
        earlier = month_name[2]
        later = month_name[1]
    }

    # Drop machines that lack a record for either month.
    for (machine in machines) {
        if (!((machine FS earlier) in data) || !((machine FS later) in data)) {
            msg = "Machine " machine " does not have exactly one record per month"
            noteprint(msg)
            delete machines[machine]
        }
    }

    allok = 1
    for (machine in machines) {
        # Adding 0 converts "123.45 GB" to the number 123.45: awk uses
        # the leading numeric prefix of the string.
        size_before = data[machine FS earlier] + 0
        size_after = data[machine FS later] + 0

        # A baseline of 0 GB has no meaningful ratio; dividing by it
        # would abort the whole run with a fatal error.
        if (size_before == 0) {
            allok = 0
            print "2;Ratio for " machine "; is undefined (0 GB in " earlier ")"
            continue
        }

        ratio = size_after / size_before * 100
        if (verbose) {
            printf "1;%s;%9.0f%% [%10.2f GB -> %10.2f GB]\n",
                machine, ratio, size_before, size_after
        }
        if (ratio < 95 || ratio > 105) {
            allok = 0
            print "2;Ratio for " machine "; is outside the 95-105% range"
        }
    }

    # Only print the summary when it applies; do not emit an empty line
    # otherwise.
    if (allok) {
        print "3;Ratio for all machines in 95-105% range"
    }
}