Skip to main content
Tweeted twitter.com/StackCodeReview/status/946264432690253824
Mod Moved Comments To Chat
added 19 characters in body
Source Link
t3chb0t
  • 44.7k
  • 9
  • 84
  • 190
/// <summary>
/// Contract for turning CSV text into rows of field values.
/// </summary>
public interface ICsvParser
{
    /// <summary>
    /// Parses <paramref name="csv"/> into a sequence of rows, each row being the list of its fields.
    /// </summary>
    /// <param name="csv">The CSV text to parse.</param>
    /// <param name="separator">The character that separates fields; defaults to <c>';'</c>.</param>
    /// <returns>One list of field strings per row.</returns>
    IEnumerable<List<string>> Parse(string csv, char separator = ';');
}

/// <summary>
/// Splits CSV text into rows of fields using the RFC 4180 quoting rules.
/// </summary>
public class CsvParser : ICsvParser
{
    /// <summary>
    /// Parses <paramref name="csv"/> into rows; each row is the list of its field values.
    /// </summary>
    /// <param name="csv">The complete CSV text. Rows are terminated by CR LF.</param>
    /// <param name="separator">The character that separates the fields of a row.</param>
    /// <returns>
    /// A lazily evaluated sequence of rows. An empty <paramref name="csv"/> yields no rows and a
    /// CR LF at the very end of the text does not produce an additional empty row.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="csv"/> is <c>null</c>. Thrown by the call itself, not during enumeration.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// Thrown while enumerating when a carriage return outside of quotes is followed by a
    /// character other than a line feed.
    /// </exception>
    public IEnumerable<List<string>> Parse(string csv, char separator = ';')
    {
        // Validate here and not inside the iterator: an iterator body does not run before the
        // first MoveNext, which would report the null argument far away from the faulty call.
        if (csv == null) { throw new ArgumentNullException(nameof(csv)); }

        return ParseRows(csv, separator);
    }

    // Walks the text once and yields one list of fields per row.
    private static IEnumerable<List<string>> ParseRows(string csv, char separator)
    {
        if (csv.Length == 0) { yield break; }

        var doubleQuote = '"';
        var carriageReturn = '\r';
        var lineFeed = '\n';

        var i = 0;

    resume:

        var isQuote = false;     // Inside a quoted section.
        var isFieldStart = true; // No character of the current field has been consumed yet.

        var buffer = new StringBuilder();
        var line = new List<string>();

        for (; i < csv.Length; i++)
        {
            var current = csv[i];
            var hasNext = i + 1 < csv.Length;
            var isQuotePair = current == doubleQuote && hasNext && csv[i + 1] == doubleQuote;

            if (isQuote)
            {
                if (isQuotePair)
                {
                    // A doubled quote inside a quoted section is one literal quote.
                    buffer.Append(doubleQuote);
                    i++; // Skip the second quote of the pair.
                }
                else
                {
                    if (current == doubleQuote)
                    {
                        // Closing quote. Whatever follows is handled as unquoted text, so a
                        // separator or line break right after it is recognized as such.
                        isQuote = false;
                    }
                    else
                    {
                        // Separators and line breaks are plain data inside quotes.
                        buffer.Append(current);
                    }
                }
            }
            else
            {
                if (current == doubleQuote)
                {
                    // At the start of a field a doubled quote belongs to a quoted field when the
                    // field ends right after it (empty field "") or a third quote follows
                    // (quoted field starting with an escaped quote). Everywhere else in unquoted
                    // text a doubled quote is kept as one literal quote.
                    if (isQuotePair && (!isFieldStart || !EndsFieldOrIsQuote(csv, i + 2, separator)))
                    {
                        buffer.Append(doubleQuote);
                        i++; // Skip the second quote of the pair.
                    }
                    else
                    {
                        isQuote = true;
                    }

                    isFieldStart = false;
                }
                else
                {
                    if (current == separator)
                    {
                        line.Add(buffer.ToString());
                        buffer.Clear();
                        isFieldStart = true;
                    }
                    else
                    {
                        if (current == carriageReturn)
                        {
                            if (hasNext)
                            {
                                if (csv[i + 1] != lineFeed)
                                {
                                    throw new ArgumentException($"Invalid character at {i + 1}. Expected '\\n' but found '{csv[i + 1]}'.");
                                }

                                i++; // Skip the line-feed.
                            }

                            i++; // Step past the line break; a lone CR at the end also ends the row.
                            goto yield;
                        }
                        else
                        {
                            buffer.Append(current);
                            isFieldStart = false;
                        }
                    }
                }
            }
        }

    yield:

        // Current buffer is not added yet.
        line.Add(buffer.ToString());

        yield return line;

        // Deciding by position means a trailing CR LF does not yield a phantom empty row.
        var eof = (i >= csv.Length);
        if (!eof)
        {
            goto resume;
        }
    }

    // True when index is past the end of the text or points at a separator, a carriage return
    // or a double quote.
    private static bool EndsFieldOrIsQuote(string csv, int index, char separator)
    {
        if (index >= csv.Length) { return true; }

        var c = csv[index];
        return c == separator || c == '\r' || c == '"';
    }
}
/// <summary>
/// Contract for turning CSV text into rows of field values.
/// </summary>
public interface ICsvParser
{
    /// <summary>
    /// Parses <paramref name="csv"/> into a sequence of rows, each row being the list of its fields.
    /// </summary>
    /// <param name="csv">The CSV text to parse.</param>
    /// <param name="separator">The character that separates fields; defaults to <c>';'</c>.</param>
    /// <returns>One list of field strings per row.</returns>
    IEnumerable<List<string>> Parse(string csv, char separator = ';');
}

/// <summary>
/// Splits CSV text into rows of fields using the RFC 4180 quoting rules.
/// </summary>
public class CsvParser : ICsvParser
{
    /// <summary>
    /// Parses <paramref name="csv"/> into rows; each row is the list of its field values.
    /// </summary>
    /// <param name="csv">The complete CSV text. Rows are terminated by CR LF.</param>
    /// <param name="separator">The character that separates the fields of a row.</param>
    /// <returns>
    /// A lazily evaluated sequence of rows. An empty <paramref name="csv"/> yields no rows and a
    /// CR LF at the very end of the text does not produce an additional empty row.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="csv"/> is <c>null</c>. Thrown by the call itself, not during enumeration.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// Thrown while enumerating when a carriage return outside of quotes is followed by a
    /// character other than a line feed.
    /// </exception>
    public IEnumerable<List<string>> Parse(string csv, char separator = ';')
    {
        // Validate here and not inside the iterator: an iterator body does not run before the
        // first MoveNext, which would report the null argument far away from the faulty call.
        if (csv == null) { throw new ArgumentNullException(nameof(csv)); }

        return ParseRows(csv, separator);
    }

    // Walks the text once and yields one list of fields per row.
    private static IEnumerable<List<string>> ParseRows(string csv, char separator)
    {
        if (csv.Length == 0) { yield break; }

        var doubleQuote = '"';
        var carriageReturn = '\r';
        var lineFeed = '\n';

        var i = 0;

    resume:

        var isQuote = false;     // Inside a quoted section.
        var isFieldStart = true; // No character of the current field has been consumed yet.

        var buffer = new StringBuilder();
        var line = new List<string>();

        for (; i < csv.Length; i++)
        {
            var current = csv[i];
            var hasNext = i + 1 < csv.Length;
            var isQuotePair = current == doubleQuote && hasNext && csv[i + 1] == doubleQuote;

            if (isQuote)
            {
                if (isQuotePair)
                {
                    // A doubled quote inside a quoted section is one literal quote.
                    buffer.Append(doubleQuote);
                    i++; // Skip the second quote of the pair.
                }
                else
                {
                    if (current == doubleQuote)
                    {
                        // Closing quote. Whatever follows is handled as unquoted text, so a
                        // separator or line break right after it is recognized as such.
                        isQuote = false;
                    }
                    else
                    {
                        // Separators and line breaks are plain data inside quotes.
                        buffer.Append(current);
                    }
                }
            }
            else
            {
                if (current == doubleQuote)
                {
                    // At the start of a field a doubled quote belongs to a quoted field when the
                    // field ends right after it (empty field "") or a third quote follows
                    // (quoted field starting with an escaped quote). Everywhere else in unquoted
                    // text a doubled quote is kept as one literal quote.
                    if (isQuotePair && (!isFieldStart || !EndsFieldOrIsQuote(csv, i + 2, separator)))
                    {
                        buffer.Append(doubleQuote);
                        i++; // Skip the second quote of the pair.
                    }
                    else
                    {
                        isQuote = true;
                    }

                    isFieldStart = false;
                }
                else
                {
                    if (current == separator)
                    {
                        line.Add(buffer.ToString());
                        buffer.Clear();
                        isFieldStart = true;
                    }
                    else
                    {
                        if (current == carriageReturn)
                        {
                            if (hasNext)
                            {
                                if (csv[i + 1] != lineFeed)
                                {
                                    throw new ArgumentException($"Invalid character at {i + 1}. Expected '\\n' but found '{csv[i + 1]}'.");
                                }

                                i++; // Skip the line-feed.
                            }

                            i++; // Step past the line break; a lone CR at the end also ends the row.
                            goto yield;
                        }
                        else
                        {
                            buffer.Append(current);
                            isFieldStart = false;
                        }
                    }
                }
            }
        }

    yield:

        // Current buffer is not added yet.
        line.Add(buffer.ToString());

        yield return line;

        // Deciding by position means a trailing CR LF does not yield a phantom empty row.
        var eof = (i >= csv.Length);
        if (!eof)
        {
            goto resume;
        }
    }

    // True when index is past the end of the text or points at a separator, a carriage return
    // or a double quote.
    private static bool EndsFieldOrIsQuote(string csv, int index, char separator)
    {
        if (index >= csv.Length) { return true; }

        var c = csv[index];
        return c == separator || c == '\r' || c == '"';
    }
}
/// <summary>
/// Contract for turning CSV text into rows of field values.
/// </summary>
public interface ICsvParser
{
    /// <summary>
    /// Parses <paramref name="csv"/> into a sequence of rows, each row being the list of its fields.
    /// </summary>
    /// <param name="csv">The CSV text to parse.</param>
    /// <param name="separator">The character that separates fields; defaults to <c>';'</c>.</param>
    /// <returns>One list of field strings per row.</returns>
    IEnumerable<List<string>> Parse(string csv, char separator = ';');
}

/// <summary>
/// Splits CSV text into rows of fields using the RFC 4180 quoting rules.
/// </summary>
public class CsvParser : ICsvParser
{
    /// <summary>
    /// Parses <paramref name="csv"/> into rows; each row is the list of its field values.
    /// </summary>
    /// <param name="csv">The complete CSV text. Rows are terminated by CR LF.</param>
    /// <param name="separator">The character that separates the fields of a row.</param>
    /// <returns>
    /// A lazily evaluated sequence of rows. An empty <paramref name="csv"/> yields no rows and a
    /// CR LF at the very end of the text does not produce an additional empty row.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="csv"/> is <c>null</c>. Thrown by the call itself, not during enumeration.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// Thrown while enumerating when a carriage return outside of quotes is followed by a
    /// character other than a line feed.
    /// </exception>
    public IEnumerable<List<string>> Parse(string csv, char separator = ';')
    {
        // Validate here and not inside the iterator: an iterator body does not run before the
        // first MoveNext, which would report the null argument far away from the faulty call.
        if (csv == null) { throw new ArgumentNullException(nameof(csv)); }

        return ParseRows(csv, separator);
    }

    // Walks the text once and yields one list of fields per row.
    private static IEnumerable<List<string>> ParseRows(string csv, char separator)
    {
        if (csv.Length == 0) { yield break; }

        var doubleQuote = '"';
        var carriageReturn = '\r';
        var lineFeed = '\n';

        var i = 0;

    resume:

        var isQuote = false;     // Inside a quoted section.
        var isFieldStart = true; // No character of the current field has been consumed yet.

        var buffer = new StringBuilder();
        var line = new List<string>();

        for (; i < csv.Length; i++)
        {
            var current = csv[i];
            var hasNext = i + 1 < csv.Length;
            var isQuotePair = current == doubleQuote && hasNext && csv[i + 1] == doubleQuote;

            if (isQuote)
            {
                if (isQuotePair)
                {
                    // A doubled quote inside a quoted section is one literal quote.
                    buffer.Append(doubleQuote);
                    i++; // Skip the second quote of the pair.
                }
                else
                {
                    if (current == doubleQuote)
                    {
                        // Closing quote. Whatever follows is handled as unquoted text, so a
                        // separator or line break right after it is recognized as such.
                        isQuote = false;
                    }
                    else
                    {
                        // Separators and line breaks are plain data inside quotes.
                        buffer.Append(current);
                    }
                }
            }
            else
            {
                if (current == doubleQuote)
                {
                    // At the start of a field a doubled quote belongs to a quoted field when the
                    // field ends right after it (empty field "") or a third quote follows
                    // (quoted field starting with an escaped quote). Everywhere else in unquoted
                    // text a doubled quote is kept as one literal quote.
                    if (isQuotePair && (!isFieldStart || !EndsFieldOrIsQuote(csv, i + 2, separator)))
                    {
                        buffer.Append(doubleQuote);
                        i++; // Skip the second quote of the pair.
                    }
                    else
                    {
                        isQuote = true;
                    }

                    isFieldStart = false;
                }
                else
                {
                    if (current == separator)
                    {
                        line.Add(buffer.ToString());
                        buffer.Clear();
                        isFieldStart = true;
                    }
                    else
                    {
                        if (current == carriageReturn)
                        {
                            if (hasNext)
                            {
                                if (csv[i + 1] != lineFeed)
                                {
                                    throw new ArgumentException($"Invalid character at {i + 1}. Expected '\\n' but found '{csv[i + 1]}'.");
                                }

                                i++; // Skip the line-feed.
                            }

                            i++; // Step past the line break; a lone CR at the end also ends the row.
                            goto yield;
                        }
                        else
                        {
                            buffer.Append(current);
                            isFieldStart = false;
                        }
                    }
                }
            }
        }

    yield:

        // Current buffer is not added yet.
        line.Add(buffer.ToString());

        yield return line;

        // Deciding by position means a trailing CR LF does not yield a phantom empty row.
        var eof = (i >= csv.Length);
        if (!eof)
        {
            goto resume;
        }
    }

    // True when index is past the end of the text or points at a separator, a carriage return
    // or a double quote.
    private static bool EndsFieldOrIsQuote(string csv, int index, char separator)
    {
        if (index >= csv.Length) { return true; }

        var c = csv[index];
        return c == separator || c == '\r' || c == '"';
    }
}
Source Link
t3chb0t
  • 44.7k
  • 9
  • 84
  • 190

Parsing RFC 4180 CSV with GOTOs

One of my data-import tools needs to support CSV files. I thought that parsing CSV was such a simple task that I didn't want to use any external libraries for it. So here is one more RFC 4180 CSV parser. This one, however, works with two gotos.


I don't preach "never use goto" because I find there are situations in which it's useful. In this implementation it allows me to reduce code repetition by having only a single yield return and by resetting all variables before parsing each line. Without the goto it would require one yield return inside the loop and another one at the end for the last line. Resetting the flags would also need to be done twice: initialization before the loop and then again after each line.

The parser does not use any continue or else-if statements. I find them confusing, so I'd rather nest one more if/else than break the flow multiple times with a continue or with seemingly equal conditions.

Everything it needs to be able to do is to parse a CSV into lines and columns. Reading files, verifying equal column count in each line or using headers for DataTables are jobs that other modules will take care of.

The interface might look unnecessary but I need it for dependency injection and mocking/testing.

/// <summary>
/// Contract for turning CSV text into rows of field values.
/// </summary>
public interface ICsvParser
{
    /// <summary>
    /// Parses <paramref name="csv"/> into a sequence of rows, each row being the list of its fields.
    /// </summary>
    /// <param name="csv">The CSV text to parse.</param>
    /// <param name="separator">The character that separates fields; defaults to <c>';'</c>.</param>
    /// <returns>One list of field strings per row.</returns>
    IEnumerable<List<string>> Parse(string csv, char separator = ';');
}

/// <summary>
/// Splits CSV text into rows of fields using the RFC 4180 quoting rules.
/// </summary>
public class CsvParser : ICsvParser
{
    /// <summary>
    /// Parses <paramref name="csv"/> into rows; each row is the list of its field values.
    /// </summary>
    /// <param name="csv">The complete CSV text. Rows are terminated by CR LF.</param>
    /// <param name="separator">The character that separates the fields of a row.</param>
    /// <returns>
    /// A lazily evaluated sequence of rows. An empty <paramref name="csv"/> yields no rows and a
    /// CR LF at the very end of the text does not produce an additional empty row.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="csv"/> is <c>null</c>. Thrown by the call itself, not during enumeration.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// Thrown while enumerating when a carriage return outside of quotes is followed by a
    /// character other than a line feed.
    /// </exception>
    public IEnumerable<List<string>> Parse(string csv, char separator = ';')
    {
        // Validate here and not inside the iterator: an iterator body does not run before the
        // first MoveNext, which would report the null argument far away from the faulty call.
        if (csv == null) { throw new ArgumentNullException(nameof(csv)); }

        return ParseRows(csv, separator);
    }

    // Walks the text once and yields one list of fields per row.
    private static IEnumerable<List<string>> ParseRows(string csv, char separator)
    {
        if (csv.Length == 0) { yield break; }

        var doubleQuote = '"';
        var carriageReturn = '\r';
        var lineFeed = '\n';

        var i = 0;

    resume:

        var isQuote = false;     // Inside a quoted section.
        var isFieldStart = true; // No character of the current field has been consumed yet.

        var buffer = new StringBuilder();
        var line = new List<string>();

        for (; i < csv.Length; i++)
        {
            var current = csv[i];
            var hasNext = i + 1 < csv.Length;
            var isQuotePair = current == doubleQuote && hasNext && csv[i + 1] == doubleQuote;

            if (isQuote)
            {
                if (isQuotePair)
                {
                    // A doubled quote inside a quoted section is one literal quote.
                    buffer.Append(doubleQuote);
                    i++; // Skip the second quote of the pair.
                }
                else
                {
                    if (current == doubleQuote)
                    {
                        // Closing quote. Whatever follows is handled as unquoted text, so a
                        // separator or line break right after it is recognized as such.
                        isQuote = false;
                    }
                    else
                    {
                        // Separators and line breaks are plain data inside quotes.
                        buffer.Append(current);
                    }
                }
            }
            else
            {
                if (current == doubleQuote)
                {
                    // At the start of a field a doubled quote belongs to a quoted field when the
                    // field ends right after it (empty field "") or a third quote follows
                    // (quoted field starting with an escaped quote). Everywhere else in unquoted
                    // text a doubled quote is kept as one literal quote.
                    if (isQuotePair && (!isFieldStart || !EndsFieldOrIsQuote(csv, i + 2, separator)))
                    {
                        buffer.Append(doubleQuote);
                        i++; // Skip the second quote of the pair.
                    }
                    else
                    {
                        isQuote = true;
                    }

                    isFieldStart = false;
                }
                else
                {
                    if (current == separator)
                    {
                        line.Add(buffer.ToString());
                        buffer.Clear();
                        isFieldStart = true;
                    }
                    else
                    {
                        if (current == carriageReturn)
                        {
                            if (hasNext)
                            {
                                if (csv[i + 1] != lineFeed)
                                {
                                    throw new ArgumentException($"Invalid character at {i + 1}. Expected '\\n' but found '{csv[i + 1]}'.");
                                }

                                i++; // Skip the line-feed.
                            }

                            i++; // Step past the line break; a lone CR at the end also ends the row.
                            goto yield;
                        }
                        else
                        {
                            buffer.Append(current);
                            isFieldStart = false;
                        }
                    }
                }
            }
        }

    yield:

        // Current buffer is not added yet.
        line.Add(buffer.ToString());

        yield return line;

        // Deciding by position means a trailing CR LF does not yield a phantom empty row.
        var eof = (i >= csv.Length);
        if (!eof)
        {
            goto resume;
        }
    }

    // True when index is past the end of the text or points at a separator, a carriage return
    // or a double quote.
    private static bool EndsFieldOrIsQuote(string csv, int index, char separator)
    {
        if (index >= csv.Length) { return true; }

        var c = csv[index];
        return c == separator || c == '\r' || c == '"';
    }
}

Example

// Test data: every array element is one CSV row using the default ';' separator.
var csv = new[]
{
    "foo;bar",               // two plain fields
    "baz;qux",               // two plain fields
    "\"foo;foo\";qux",       // quoted field that contains the separator
    "foo\"\";\"\"bar",       // doubled quotes in fields that are not enclosed in quotes
    "\"foo;\"\"foo\";qux",   // quoted field containing the separator and a doubled quote
    ";",                     // nothing but a separator
}
.Join("\r\n"); // my helper extension (defined outside this snippet; presumably concatenates the rows with CR LF in between - confirm)

var csvParser = new CsvParser();
// Dump() is not defined in this snippet - presumably LINQPad's output helper; confirm.
csvParser.Parse(csv).Dump();
// Empty input.
csvParser.Parse("").Dump();
csvParser.Parse("").Dump();

Output:

foo 
bar 

baz 
qux 
 
foo;foo 
qux 
 
foo" 
"bar 
 
foo;"foo 
qux 

<empty>
<empty>

<empty> is just a placeholder I used here to indicate empty strings.