1 vote

I have a CSV file in which some records contain line breaks inside quoted fields, but I need to treat each such record as a single row.

Here is a sample:

strbarcode,strdescription,strsize,decprice,intcaseqty,deccost,vat,departmentId
5015397310361,Yellow/Black Post Complete with Base ,1,0.01,2,0.01,20,18
5015397615305,"Hand sanitiser board c/w manual dispenser - 6 image design - Turquoise (300 x 400mm)
Complete with fixings",1,0.01,0,0.01,20,18
,"Barrier cream board c/w with manual dispenser - Hands - Blue (300 x 400mm)
Complete with fixings",1,0.01,0,0.01,20,18
,"Barrier cream board c/w with manual dispenser - Hands - Turquiose (300 x 400mm)
Complete with fixings",1,0.01,0,0.01,20,18
,"Barrier cream board c/w with manual dispenser - 3 image design - Blue (300 x 400mm)
Complete with fixings",1,0.01,0,0.01,20,18
5015397310354,Red/White Post Complete with Base ,1,0.01,2,0.01,20,18

Here is my current code:

private void ImportProductDataFile(string fileName)
{
    try
    {
        pictureBox_loading.Visible = true;
        pictureBox_loading.Refresh();
        Cursor.Current = Cursors.WaitCursor;
        StreamReader sr = new StreamReader(fileName);
        string[] headers = sr.ReadLine().Split(',');
        DataTable dt = new DataTable();
        DataTable dtnew = new DataTable();
        foreach (string header in headers)
        {
            dt.Columns.Add(header);
            dtnew.Columns.Add(header);
        }
        while (!sr.EndOfStream)
        {
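            // NOTE: ReadLine() returns one physical line at a time, so a quoted field
            // that contains a line break is split across two reads here; this is why the
            // multi-line records in the sample end up spread over several rows.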
            string[] rows = Regex.Split(sr.ReadLine(), ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
            DataRow dr = dt.NewRow();
            for (int i = 0; i < headers.Length; i++)
            {
                if (rows.Length > 0 && i >= (rows.Length))
                {
                    dr[i] = "";
                }
                else
                {
                    dr[i] = rows[i];
                }
            }
            dt.Rows.Add(dr);
        }
        dt.AcceptChanges();
        var emptyRows = dt.Select().Where(
               row => dt.Columns.Cast<DataColumn>().All(column => string.IsNullOrEmpty(row[column].ToString()))).ToArray();
        Array.ForEach(emptyRows, x => x.Delete());
        dt.AcceptChanges();
        if (dt.Columns.Contains("departmentId"))
        {
            dt.Columns.Remove("departmentId");
            dt.AcceptChanges();
        }
        if (dtnew.Columns.Contains("departmentId"))
        {
            dtnew.Columns.Remove("departmentId");
            dtnew.AcceptChanges();
        }
        //dt = dt.AsEnumerable().GroupBy(r => new { strbarcode = r.Field<string>("strbarcode")}).Select(g => g.Last()).CopyToDataTable();
        dt = dt.AsEnumerable()
             .GroupBy(r => r.Field<string>("strbarcode").Trim(),
             (key, g) => g.OrderByDescending(
                 y =>
                 {
                     try
                     {
                         return Convert.ToDecimal(y.Field<string>("decprice"));
                     }
                     catch (Exception)
                     {
                         return Decimal.Zero;
                     }
                 }
             ).First()).CopyToDataTable();
        dt.AcceptChanges();
        //add data dt to dtnew 
        DataRow Rownew;
        foreach (DataRow row in dt.Rows)//dt  as sourcetable and dt new as destination  table
        {
            Rownew = dtnew.NewRow();
            //if (row["strbarcode"].ToString().Trim() == "")
            //{
            //    continue;
            //}
            Rownew["strbarcode"] = row["strbarcode"].ToString().Trim() != "" ? row["strbarcode"].ToString() : "";
            Rownew["strdescription"] = row["strdescription"].ToString().Trim() != "" ? row["strdescription"].ToString().Replace(",", "") : "";
            Rownew["strsize"] = row["strsize"].ToString() != "" ? row["strsize"].ToString() : "";
            Rownew["decprice"] = row["decprice"].ToString().Trim() != "" ? objUtility.ToDecimal(row["decprice"].ToString()) : 0;
            Rownew["intcaseqty"] = row["intcaseqty"].ToString().Trim() != "" ? objUtility.ToInt(row["intcaseqty"].ToString()) : 0;
            Rownew["deccost"] = row["deccost"].ToString().Trim() != "" ? objUtility.ToDecimal(row["deccost"].ToString()) : 0;
            Rownew["vat"] = row["vat"].ToString().Trim() != "" ? objUtility.ToDecimal(row["vat"].ToString()) : 0;
            dtnew.Rows.Add(Rownew);
        }
        dtnew.AcceptChanges();
        sr.Close();
        //create use type table and stored procedure 
        string strStatus = OjbDataAccess.UpdateDBForImportProductSP();
        if (strStatus == "success")
        {
            //inserting and updating data in plof database 
            string strMsg = OjbDataAccess.ImportProdcttosuperplofInsertUpdate(dtnew);
            pictureBox_loading.Visible = false;
            Cursor.Current = Cursors.Default;
            MessageBox.Show(strMsg, "Data Import Status", MessageBoxButtons.OK, MessageBoxIcon.Information);
        }
        else
        {
            pictureBox_loading.Visible = false;
            Cursor.Current = Cursors.Default;
            MessageBox.Show(strStatus, "Data Import Status", MessageBoxButtons.OK, MessageBoxIcon.Information);
        }
    }
    catch (Exception ex)
    {
        pictureBox_loading.Visible = false;
        Cursor.Current = Cursors.Default;
        txtFile.Text = string.Empty;
        btnImportFile.Enabled = true;
        MessageBox.Show("Please enter valid CSV file");
        WriteErrorLog("=============================================================================" + Environment.NewLine);
        WriteErrorLog("Product CSV file is not imported  due to some error in import file." + Environment.NewLine);
        WriteErrorLog(ex.ToString() + Environment.NewLine);
        WriteErrorLog("=============================================================================" + Environment.NewLine);
        return;
    }
    this.Dispose();
}
Maybe use a standards-compliant CSV library that knows how to deal with quoting instead of rolling your own? – AKX
Instead of reinventing the wheel (unless you actually want to make one), take a look at FileHelpers, CsvHelper, FlatFiles, or NugetMustHaves: CSV. – Olivier Rogier
If you absolutely want to implement your own CSV reader, I suggest using tokens instead of Regex. – Sebastian Grunow
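
As the comments suggest, a dedicated CSV library already knows how to keep a quoted field that contains commas and line breaks inside a single record. Below is a minimal sketch using CsvHelper (one of the libraries mentioned above); the CsvImport/LoadCsv names are just for illustration, and it assumes the file uses standard RFC 4180 quoting.

using System.Data;
using System.Globalization;
using System.IO;
using CsvHelper;

static class CsvImport
{
    // Loads the CSV into a DataTable; CsvHelper keeps quoted fields that
    // contain commas or line breaks inside a single record.
    public static DataTable LoadCsv(string fileName)
    {
        using (var reader = new StreamReader(fileName))
        using (var csv = new CsvReader(reader, CultureInfo.InvariantCulture))
        using (var dataReader = new CsvDataReader(csv))
        {
            var dt = new DataTable();
            dt.Load(dataReader); // the header row becomes the column names
            return dt;
        }
    }
}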

2 Answers

0 votes

If the structure of your text file is always the same, a single regular expression can parse the file:

    Regex lineExp = new Regex(@"(?<strbarcode>[0-9]+)?,((?<strdescription>[^,""]+)|""(?<strdescription>[^""]+)""),(?<strsize>[0-9]+),(?<decprice>[0-9.]+),(?<intcaseqty>[0-9]+),(?<deccost>[0-9.]+),(?<vat>[0-9]+),(?<departmentId>[0-9]+)([\r\n]+|$)", RegexOptions.Singleline | RegexOptions.Compiled);
    Regex lineBreakExp = new Regex(@"[\r\n]+", RegexOptions.Singleline | RegexOptions.Compiled);

    string data = @"strbarcode,strdescription,strsize,decprice,intcaseqty,deccost,vat,departmentId
5015397310361,Yellow/Black Post Complete with Base ,1,0.01,2,0.01,20,18
5015397615305,""Hand sanitiser board c/w manual dispenser - 6 image design - Turquoise (300 x 400mm)
Complete with fixings"",1,0.01,0,0.01,20,18
,""Barrier cream board c/w with manual dispenser - Hands - Blue (300 x 400mm)
Complete with fixings"",1,0.01,0,0.01,20,18
,""Barrier cream board c/w with manual dispenser - Hands - Turquiose (300 x 400mm)
Complete with fixings"",1,0.01,0,0.01,20,18
,""Barrier cream board c/w with manual dispenser - 3 image design - Blue (300 x 400mm)
Complete with fixings"",1,0.01,0,0.01,20,18
5015397310354,Red/White Post Complete with Base ,1,0.01,2,0.01,20,18
5015397310361,""YProduct with delimiter, so take care"",1,0.01,2,0.01,20,18
5015397310361,Product with invalid strsize will be ignored,a,0.01,2,0.01,20,18
5015397310361,Last line ends not with CRLF,3,0.01,2,0.01,20,18";

    var matches = lineExp.Matches(data);
    int i = 1;
    foreach (Match m in matches)
    {
        Console.Write($"Item {i++}\t");
        Console.Write($"{m.Groups["strbarcode"].Value}\t");
        Console.Write($"{lineBreakExp.Replace(m.Groups["strdescription"].Value, " ")}\t");
        Console.Write($"{m.Groups["strsize"].Value}\t");
        Console.Write($"{m.Groups["decprice"].Value}\t");
        Console.Write($"{m.Groups["intcaseqty"].Value}\t");
        Console.Write($"{m.Groups["deccost"].Value}\t");
        Console.Write($"{m.Groups["vat"].Value}\t");
        Console.WriteLine(m.Groups["departmentId"].Value);
    }

Output:

Item 1  5015397310361   Yellow/Black Post Complete with Base    1   0.01    2   0.01    20  18
Item 2  5015397615305   Hand sanitiser board c/w manual dispenser - 6 image design - Turquoise (300 x 400mm)  Complete with fixings 1   0.01    0   0.01    20  18
Item 3      Barrier cream board c/w with manual dispenser - Hands - Blue (300 x 400mm)  Complete with fixings   1   0.01    0   0.01    20  18
Item 4      Barrier cream board c/w with manual dispenser - Hands - Turquiose (300 x 400mm)  Complete with fixings  1   0.01    0   0.01    20  18
Item 5      Barrier cream board c/w with manual dispenser - 3 image design - Blue (300 x 400mm)  Complete with fixings  1   0.01    0   0.01    20  18
Item 6  5015397310354   Red/White Post Complete with Base   1   0.01    2   0.01    20  18
Item 7  5015397310361   YProduct with delimiter, so take care   1   0.01    2   0.01    20  18
Item 8  5015397310361   Last line ends not with CRLF    3   0.01    2   0.01    20  18
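
To apply this to the question's file, read the whole file into one string before matching, so that a record spanning two physical lines stays inside a single match (a small sketch; fileName is the path passed to the import method):

    // Read the entire file at once; reading it line by line would split the
    // quoted multi-line records before the regex ever sees them.
    string data = File.ReadAllText(fileName);
    var matches = lineExp.Matches(data);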
0 votes

I maintain a couple of libraries that might help you: Sylvan.Data.Csv and Sylvan.Data. The Csv library provides CsvDataReader, which handles parsing CSV data with quoted fields that contain delimiters and newlines, such as in your file. The Sylvan.Data library (currently prerelease, but available on NuGet) provides the Schema type, which can be used to apply a strongly typed schema to your CSV data. Together, these libraries make it very easy to load a DataTable with strongly typed data from a CSV file.

// define a schema for your csv.
var schema = Schema.Parse("strbarcode,strdescription,strsize,decprice:decimal,intcaseqty:int,deccost:decimal,vat:int,departmentId:int");
// provide the schema when constructing the CsvDataReader
var options = new CsvDataReaderOptions { Schema = new CsvSchema(schema) };
var data = CsvDataReader.Create("data.csv", options);
var dt = new DataTable();
// Load the data table with the csv data
dt.Load(data);

I don't have any documentation for the Schema.Parse syntax, but all of the .NET primitive types are supported. You can also remap column names via the schema, if you wish, using the ">" notation: "strbarcode>BarCode,strdescription>Description,strsize>Size,decprice>Price:decimal,intcaseqty>CaseQuantity:int,deccost>Cost:decimal,vat>Vat:int,departmentId>DepartmentId:int"
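
For instance, a remapped schema slots into the same loading code shown above (a sketch reusing that snippet; the renamed columns end up on the DataTable):

// schema string with ">" remapping, otherwise identical to the example above
var schema = Schema.Parse("strbarcode>BarCode,strdescription>Description,strsize>Size,decprice>Price:decimal,intcaseqty>CaseQuantity:int,deccost>Cost:decimal,vat>Vat:int,departmentId>DepartmentId:int");
var options = new CsvDataReaderOptions { Schema = new CsvSchema(schema) };
var dt = new DataTable();
// the resulting columns are named BarCode, Description, Size, Price, ...
dt.Load(CsvDataReader.Create("data.csv", options));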

Sylvan.Data.Csv also happens to be the fastest CSV parser in the .NET ecosystem, so if you have huge files to deal with, you won't find a faster solution.