4
\$\begingroup\$

This program is an expansion of the sorting utility built in previous chapters of K&R's 2nd edition of ANSI-C Programming. It parses command-line arguments to determine which fields to sort by and what options to apply to each field. For example, a command like ./sort -k2n -k1dr would mean "sort primarily by the 2nd field numerically, and for lines where the 2nd field is equal, sort by the 1st field in reverse directory order."

I'm a beginner working through K&R to improve my C programming skills and could use a review of my overall approach and implementation. Specifically, I have a few areas I'd like feedback on:

Argument Parsing: Is my approach to parsing the -k options in main() robust and easy to understand? It feels a bit complex, and I'm wondering if there's a cleaner way to handle it.

Field Extraction and Comparison: The core logic is in master_compare and extract_field. I'm temporarily modifying the input strings by inserting a '\0' to delimit a field for comparison, and then restoring the original character. Is this a safe and efficient practice, or is there a better way to handle field comparisons without modifying the strings?

Overall Structure: Does the program flow make sense? Are there any obvious bugs, edge cases, or violations of C best practices that I've missed?

Readability and Style: Is the code clear and maintainable? Any suggestions for improving variable names, comments, or function design would be very helpful.

Here is the code:

#include <ctype.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
Exercise 5-17: Add a field-searching capability, so sorting may be done on fields within lines, each
field sorted according to an independent set of options.
*/

#define MAXLINES 5000 /* max #lines to be sorted */
#define MAXLEN 1000 /* max length of any input line */
#define ALLOCSIZE 10000 /* size of available space */

#define OPT_REVERSE (1 << 0)    // 0b0001
#define OPT_NUMERIC (1 << 1)    // 0b0010
#define OPT_DIRECTORY (1 << 2)  // 0b0100
#define OPT_FOLD (1 << 3)       // 0b1000

#define MAX_FIELDS 1024         // max number of possible fields to add sorting conditions to
#define MAX_SORT_FIELDS 10      // max number of sorting rules we can handle

char *lineptr[MAXLINES]; /* pointers to text lines */

typedef struct {
    int field_num;
    uint8_t options;
} FIeldSort;

FIeldSort field_sorts[MAX_FIELDS] = {0};
int field_index = 0;
int field_count = 0;

void my_qsort(void *lineptr[], int left, int right, int (*comp)(void *, void *));
void swap(void *v[], int i, int j);
int readlines(char *lineptr[], char *buf);
void writelines(char *lineptr[], int nlines);
int getline(char *, int);

int my_strcmp(char *s, char *t);
int strcmp_insensitive(char *s, char *t);
int strcmp_reverse(char *s, char *t);

int numcmp(char *, char *);
int numcmp_reverse(char *s, char *t);

int dir_cmp(char *s, char *t);
int dircmp_insensitive(char *s, char *t);
int dir_cmp_helper(char *s, char *t, int sensitive);

int master_compare(char *lineA, char *lineB);
void extract_field(char **start, char **end, int n);

/* sort input lines */
int main(int argc, char *argv[])
{
    int nlines;             /* number of input lines read */
    char buf[ALLOCSIZE] = {'\0'};
    int order = 1;
    
    // holds options for each field
    uint8_t field_options = 0;

    /* COMMAND LINE ARG PARSING */
    if (argc == 0)
        printf("no arguments... basic line sort...?\n");
    else if (argc > 1) {
        /* parse command line for field options */
        while (++argv && *argv != NULL) {
            if (strncmp(*argv, "-k",2) == 0) {
                int temp = 0;
                char *p = *argv + 2;
                while (isdigit(*p)) {
                    temp = (temp * 10) + (*p - '0');
                    p++;
                }

                field_sorts[field_index].field_num = temp;

                int max_fields = 4;
                while (*p != '\0' && isalpha(*p) && max_fields-- > 0) {
                    switch (*p) {
                        case 'r':
                            field_sorts[field_index].options |= OPT_REVERSE;
                            break;
                        case 'n':
                            field_sorts[field_index].options |= OPT_NUMERIC;
                            break;
                        case 'd':
                            field_sorts[field_index].options |= OPT_DIRECTORY;
                            break;
                        case 'f':
                            field_sorts[field_index].options |= OPT_FOLD;
                            break;
                        default:
                            printf("field option not recognized... expected 'r', 'n', 'd', or 'f'...\n");
                            return 1;
                    }
                    p++;
                }
                field_index++;
            } else {
                printf("incorrect format... expected '-k'...\n");
                return 1;
            }
        }
    }
    
    int (*comparison_function)(void *, void*) = (int (*)(void*, void*))master_compare;

    if ((nlines = readlines(lineptr, buf)) > 0) {
        my_qsort((void**) lineptr, 0, nlines-1, comparison_function);
        writelines(lineptr, nlines);
        return 0;
    } else {
        printf("error: no lines read\n");
        return 1;
    }
}

/* qsort: sort v[left]....v[right] into increasing order */
void my_qsort(void *v[], int left, int right, int (*comp)(void *, void *)) {

    if (left >= right)  /* do nothing if the array contains nothing */
        return;
    swap(v, left, (left+right)/2);
    int last = left;
    for (int i = left+1; i <= right; i++)
        if ((*comp)(v[i], v[left]) < 0)
            swap(v, ++last, i);
    swap(v, left, last);
    my_qsort(v, left, last-1, comp);
    my_qsort(v, last+1, right, comp);
}

/* numcmp: compare s1 and s2 numerically */
int numcmp(char *s1, char *s2) {
    double v1, v2;

    v1 = atof(s1);
    v2 = atof(s2);
    if (v1 < v2)
        return -1;
    else if (v1 > v2)
        return 1;
    else
        return 0;
}

void swap(void *v[], int i, int j) {
    void *temp;

    temp = v[i];
    v[i] = v[j];
    v[j] = temp;
}

/* readlines: read input lines */
int readlines(char *lineptr[], char *buf) {
    int len, nlines, nchars;
    char *b = buf;
    char *bufmax = buf + ALLOCSIZE, line[MAXLEN];

    nlines = 0; nchars = 0;
    while ((len = getline(line, MAXLEN)) > 0) {
        line[len-1] = '\0'; /* delete newline */
        strcpy(buf, line);
        lineptr[nlines++] = buf;
        buf+= len;
        if (nlines >= MAXLINES)
            return -1;
    }
    return nlines;
}

/* writelines: write output lines */
void writelines(char *lineptr[], int nlines) {
    while (nlines-- > 0) {
        printf("%s\n", *lineptr++);
        fflush(stdout);
    }
}

int getline(char *s, int lim) {
    int c, i;
    i = 0;
    while (--lim > 0 && (c=getchar()) != EOF && c != '\n')
        *(s + i++) = c;
    if (c == '\n')
        *(s+i++) = c;

    *(s+i) = '\0';
    return i;
}

/* strcmp: return <0 if s<t, 0 if s==t, >0 if s>t */
int my_strcmp(char *s, char *t) {
    for ( ; *s == *t; s++, t++)
        if (*s == '\0')
            return 0;
    return *s - *t;
}

int master_compare(char *lineA, char *lineB) {
    int result;
    for (int i = 0; i < field_index; i++) {
        /* pointers are reset to the start of the line for each new field rule */
        char *fieldA = lineA, *fieldA_end, *fieldB = lineB, *fieldB_end;

        /* pass the ADDRESS of the pointers so the function can change them */
        extract_field(&fieldA, &fieldA_end, i);
        extract_field(&fieldB, &fieldB_end, i);

        /* temporarily terminate the fields to treat the as separate strings */
        char A_original = *fieldA_end;
        char B_original = *fieldB_end;
        *fieldA_end = '\0';
        *fieldB_end = '\0';

        // !! ready to do comparison !!
        uint8_t current_options = field_sorts[i].options;

        int numeric = 0, order = 1, fold = 0, directory = 0;

        if (current_options & OPT_FOLD)
            fold = 1;
        if (current_options & OPT_NUMERIC)
            numeric = 1;
        if (current_options & OPT_DIRECTORY)
            directory = 1;
        if (current_options & OPT_REVERSE)
            order = -1;

        if (numeric)
            result = numcmp(fieldA, fieldB);
        else if (directory & fold)
            result = dircmp_insensitive(fieldA, fieldB);
        else if (directory)
            result = dir_cmp(fieldA, fieldB);
        else if (fold)
            result = strcmp_insensitive(fieldA, fieldB);
        else
            result = my_strcmp(fieldA, fieldB);

        /* remove temporary null terminator */
        *fieldA_end = A_original;
        *fieldB_end = B_original;

        /* if a difference was found, return the result immediately */
        if (result != 0)
            return order*result;
    }
    /* if all fields were equal, return 0 */
    return 0;
 }

void extract_field(char **start, char **end, int n) {
    /* create a local copy to work with */
    char *p = *start;
    int field_no = 0;

    while (*p != '\0') {

        /* move to the start of the next field */
        while (isblank(*p))
            p++;

        /* if we're at the end of the string, stop */
        if (*p == '\0')
            break;

        field_no++;

        /* if we found our target field, break the search */
        if (field_no == field_sorts[n].field_num)
            break;

        /* otherwise, skip this field and continue searching */
        while (!isblank(*p) && *p != '\0')
            p++;
    }

    // update the caller's pointers with the results
    *start = p;
    *end = p;
    while (!isblank(**end) && **end != '\0')
        (*end)++;
}

int strcmp_insensitive(char *s, char *t) {
    for ( ; tolower(*s) == tolower(*t); s++, t++)
        if (tolower(*s) == '\0')
            return 0;
    return tolower(*s) - tolower(*t);
}

int dir_cmp(char *s, char *t) {
    return dir_cmp_helper(s, t, 0);
}

int dircmp_insensitive(char *s, char *t) {
    return dir_cmp_helper(s, t, 1);
}

int dir_cmp_helper(char *s, char *t, int sensitive) {
    /* SKIP TO ALPHANUMERIC CHARACTER */
    while(*s != '\0' && !isalnum(*s) && *s != ' ')
        s++;
    while(*t != '\0' && !isalnum(*t) && *t != ' ')
        t++;

    /* COMPARE CHARACTER BY CHARACTER */
    while (*s != '\0' && *t != '\0') {
        char x, y;
        if (sensitive) {
            x = tolower(*s);
            y = tolower(*t);

            if(x != y)
                return x - y;            /* RETURN FINAL COMPARISON */
        } else {
            if(*s != *t)
                return *s - *t;            /* RETURN FINAL COMPARISON */
        }

        s++; t++;
        /* CONTINUE COMPARISON, SKIP TO ALPHANUMERIC CHAR */
        while(*s != '\0' && !isalnum(*s) && *s != ' ')
            s++;
        while(*t != '\0' && !isalnum(*t) && *t != ' ')
            t++;
    }

    /* REACHED END OF AT LEAST ONE STRING */
    if (*s == '\0' && *t == '\0')
        return 0;
    else if (*s == '\0')
        return -1;
    else
        return 1;
}
\$\endgroup\$
4
  • \$\begingroup\$ Pardon my ignorance: What exactly is “directory order”? \$\endgroup\$ Commented Sep 18 at 8:27
  • \$\begingroup\$ @MartinR According to and for this question same as sort -d. See e.g. man7 (with blank possibly meaning whitespace). \$\endgroup\$ Commented Sep 18 at 8:49
  • \$\begingroup\$ Note well that even the 2nd edition of K&R is primarily of historic value at this point. It describes a version of C pretty close to the 1990 edition of the ISO standard (C90), which is long obsolete. I do not recommend using it as textbook for learning C. There have been four revisions of C published since then, the latest in 2024. The newer ones are not wholly backwards compatible, having removed support for some of the more troublesome provisions of early C. \$\endgroup\$ Commented Sep 18 at 15:27
  • \$\begingroup\$ @JohnBollinger Thank you for the heads up. Would you be able to recommend more current resources? I've seen Modern C by Jens Gustedt and Beej's Guide to C get bought up. Have you come across these? \$\endgroup\$ Commented Sep 23 at 14:04

3 Answers 3

6
\$\begingroup\$

I don't have time for a full review, but there's a couple of possible issues to address.

One is that the functions from <ctype.h> don't accept negative values other than EOF. That means that plain char arguments should be cast to unsigned char before (implicit) conversion to int. I recommend changing the comparison functions (including master_compare()) to accept unsigned char const *s, unsigned char const *t (it's easier to use unsigned char throughout that function than to add casts to every isalnum() and tolower().

Another is that there's no explanation of why we need my_strcmp() and what it does differently to standard strcmp(). I note that it unnecessarily requires its arguments to point to non-const strings, which makes it less useful than the standard function.

Similarly, I don't see why my_qsort can't simply call standard qsort() function.

Error handling: if input exceeds our built-in limits MAXLINES or MAXLEN then we produce incorrect output; it would be better to issue warning message to stderr and immediately terminate with EXIT_FAILURE status. Even worse, if input exceeds ALLOC_SIZE then we run into Undefined Behaviour. If reading input fails before we reach end-of-file, that should be treated similarly.

Minor: includes are almost sorted, but <stdlib.h> has been put after <string.h>.

\$\endgroup\$
2
  • \$\begingroup\$ Error handling: strcpy() can return a null pointer strcpy() returns the first argument. But if the first argument is NULL, it will violate the constraint "in all cases a char * or void * argument points to the initial (lowest addressed) character of the array" and invoke undefined behavior. \$\endgroup\$ Commented Sep 18 at 19:08
  • 1
    \$\begingroup\$ @Andrew - brain fart by me - I was looking for where storage is allocated and misread that as strdup(). Looking again, it seems that storage is in local variable buf in main(); that introduces problems of its own, since it may exceed local variable capacity - and readlines() doesn't actually prevent (or even detect) overflow. \$\endgroup\$ Commented Sep 19 at 7:58
4
\$\begingroup\$

A point not mentioned yet is that you're using atof in numcmp. This function gives you no ability whatsoever to do any form of error checking and know whether it succeeded or not. If it didn't succeed you're now in undefined behavior territory using those variables.

You'd be better served by using strtod.

Relevant reading on Stack Overflow: Is there any difference in the way atof and strtod work?

Now, it's possible that you've done error checking at the point where you're calling numcmp to ensure both of its arguments can be parsed to valid floating point numbers, but if you've done this, there's no point in duplicating your effort by sending numcmp strings rather than the floating point numbers in those strings.

\$\endgroup\$
1
  • 1
    \$\begingroup\$ Note that both strtod() and atof() also have cross-platform inconsistencies when the significant digit count is large or the value is near the min/max magnitude. See C23 § 7.24.1.5 10-13. Yes, "better served by using strtod" is true yet portable conversion remains challenging. \$\endgroup\$ Commented Sep 23 at 14:57
3
\$\begingroup\$

Review of helper function: my_strcmp():

Differences between OP's my_strcmp() and standard library strcmp():

  • Common: When *s and *t differ in sign.

    When a char is negative and the other positive (or zero), OP's my_strcmp() does a signed compare. strcmp() compares char as if they were unsigned char.

    For all functions in this subclause, each character shall be interpreted as if it had the type unsigned char (and therefore every possible object representation is valid and has a different value).

  • Rare: when char width is as wide as int.

    *s - *t risk overflow and so undefined behavior. Idiomatic to use 2 compares:

      // return *s - *t;
      return (*s > *t) - (*s < *t);
    
  • Minor: strcmp() allows calling with const char * arguments, unlike OP's function.

  • Historic: Non-2's complement my_strcmp() accesses negative values incorrectly when an unsigned access and compared is specified. Problems include too early a string test as -0 is not a null character, accessing a trap value and incorrect compare. Best to access and compare as if unsigned char pointers.


Candidate alternative (if one desires their own similar function, else use the library one):

int my_strcmp_alt(const char *s, const char *t) {
    const unsigned char *us = (const unsigned char *) s;
    const unsigned char *ut = (const unsigned char *) t;
    while (*us == *ut && *us) {
      us++;
      ut++;
    }
    return (*us > *ut) - (*us < *ut);
}
\$\endgroup\$

You must log in to answer this question.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.