Unix, group rows and sum values of columns from file.csv

Question

I have this file.csv

"201707"|"51976551"|1|0|1|"20170702"
"201707"|"51955194"|1|0|0|"20170702"
"201707"|"51923555"|1|0|1|"20170702"
"201707"|"51976551"|1|0|1|"20170703"
"201707"|"51955194"|1|0|0|"20170703"
"201707"|"51923555"|1|0|1|"20170703"
"201707"|"51960597"|1|0|0|"20170703"

My desired result is to group by the number (column 2) and sum columns 3, 4 and 5:

"201707"|"51976551"|2|0|2
"201707"|"51955194"|2|0|0
"201707"|"51923555"|2|0|2
"201707"|"51960597"|1|0|0

I've tried with:

# Sum column 3 per value of column 2.
# awk reads file.csv directly, so no separate cat process is needed.
awk -F'|' '
  # Accumulate the third field under the second field (the quoted number).
  { total[$2] += $3 }
  END {
    # for-in visits keys in an unspecified order.
    for (id in total) {
      printf "%s|%s\n", id, total[id]
    }
  }
' file.csv

And the result is:

"51976551"|2
"51955194"|2
"51923555"|2
"51960597"|1

This only shows the sum of the third column, but I need two more columns. What should I do in this case?

John1024 · Accepted Answer · 2018-01-19 01:11:44Z

4

Try:

# Group rows by fields 1 and 2 and sum fields 3, 4 and 5 for each group.
# The key is joined with OFS so that `print i, ...` emits it with the same
# "|" delimiter as the totals. OFS is assigned on the command line ahead of
# file.csv, so it is in effect before the first record is read.
# `for (i in a)` does not visit keys in input order, so the rows may come
# out in a different order than they first appear in the file.
$ awk -F"|" '{ a[$1 OFS $2]+=$3; b[$1 OFS $2]+=$4; c[$1 OFS $2]+=$5 }
  END {
    for (i in a) {
      print i, a[i], b[i], c[i];
    }
  }
' OFS=\| file.csv
"201707"|"51976551"|2|0|2
"201707"|"51960597"|1|0|0
"201707"|"51923555"|2|0|2
"201707"|"51955194"|2|0|0

How it works

-F"|"

This sets the field separator on input to |.
a[$1 OFS $2]+=$3; b[$1 OFS $2]+=$4; c[$1 OFS $2]+=$5

This keeps track of the totals of the third, fourth, and fifth columns.
END { for (i in a) { print i, a[i], b[i], c[i]; } }

This prints out the results.
OFS=\|

This tells awk to use | as the field separator on output.

edited Jan 19, 2018 at 1:11

answered Jan 19, 2018 at 0:34

John1024

115k15 gold badges151 silver badges183 bronze badges

Sign up to request clarification or add additional context in comments.

4 Comments

karakfa Over a year ago

better to define key=$1 FS $2 and use it instead.

John1024 Over a year ago

@karakfa Excellent suggestion! Thanks. (Upon reflection on your comment, I suspect OFS is a better choice.) Answer updated.

karakfa Over a year ago

Here it doesn't matter since they are the same, but in general OFS won't guarantee that the composite key will be unique, whereas FS guarantees that.

Diana Ysabel Over a year ago

Thanks John1024 and karakfa, FS is better; it correctly showed the sum of each column! You saved me!

Akshay Hegde · Accepted Answer · 2018-01-19 02:34:18Z

To Preserve order:

By processing in END block

awk -F'|' -v OFS='|' '
    # Group on fields 1 and 2. Record each key the first time it appears so
    # the END block can replay the groups in input order.
    {
        key = $1 OFS $2
        if (!(key in seen)) {
            seen[key]
            order[++nkeys] = key
        }
        # Running totals for columns 3..5, indexed by "key|column".
        for (col = 3; col <= 5; col++)
            sum[key OFS col] += $col
    }

    # Emit one line per group: the key followed by its three totals.
    END {
        for (n = 1; n <= nkeys; n++) {
            key = order[n]
            row = key
            for (col = 3; col <= 5; col++)
                row = row OFS sum[key OFS col]
            print row
        }
    }
' infile

Or by reading the same file twice (GNU awk)

awk -F'|' -v OFS='|' '
    # Both passes use the same composite grouping key (fields 1 and 2).
    { key = $1 OFS $2 }

    # Pass 1 (first reading of infile): accumulate columns 3..5 per key and
    # mark the key as not yet printed.
    NR == FNR {
        for (col = 3; col <= 5; col++)
            sum[key OFS col] += $col
        pending[key]
        next
    }

    # Pass 2 (second reading): print the totals the first time each key is
    # met, then forget the key so later duplicates are skipped.
    key in pending {
        totals = ""
        for (col = 3; col <= 5; col++)
            totals = (totals ? totals OFS : "") sum[key OFS col]
        print key, totals
        delete pending[key]
    }
' infile infile

Input:

$ cat input
"201707"|"51976551"|1|0|1|"20170702"
"201707"|"51955194"|1|0|0|"20170702"
"201707"|"51923555"|1|0|1|"20170702"
"201707"|"51976551"|1|0|1|"20170703"
"201707"|"51955194"|1|0|0|"20170703"
"201707"|"51923555"|1|0|1|"20170703"
"201707"|"51960597"|1|0|0|"20170703"

Output-1:

# One-line form of the END-block approach: o[] records each "field1|field2"
# key in first-seen order, a[] holds the per-key totals of columns 3..5, and
# END prints the keys in that recorded order.
$ awk  'BEGIN{FS=OFS="|"}{k = $1 OFS $2; if(!(k in t)){o[++c]=k; t[k]} for(i=3; i<=5; i++)a[k OFS i]+=$i}END{for(i=1; i in o; i++){printf "%s", o[i]; for(j=3; j<=5; j++)printf "%s%s", OFS, a[o[i] OFS j]; print ""}}' infile
"201707"|"51976551"|2|0|2
"201707"|"51955194"|2|0|0
"201707"|"51923555"|2|0|2
"201707"|"51960597"|1|0|0

Output-2:

# One-line form of the two-pass approach: infile is named twice, so the
# FNR==NR rule sums columns 3..5 per key during the first reading, and the
# `k in t` rule prints each key with its totals on first sight during the
# second reading, deleting the key so it is printed only once.
$ awk  'BEGIN{FS=OFS="|"}function ps(f){for(i=3;i<=5;i++)if(f){ a[k OFS i]+=$i; t[k] }else s=(s ? s OFS :"") a[k OFS i]}{k=$1 OFS $2}FNR==NR{ps(1); next}k in t{s=""; ps();  print k, s; delete t[k] }' infile infile
"201707"|"51976551"|2|0|2
"201707"|"51955194"|2|0|0
"201707"|"51923555"|2|0|2
"201707"|"51960597"|1|0|0

Collectives™ on Stack Overflow

Unix, group rows and sum values of columns from file.csv

2 Answers 2

How it works

4 Comments

Comments

Hot Network Questions

Collectives™ on Stack Overflow

2 Answers 2

How it works

4 Comments

Comments

Related