2
votes

I'm trying to join two DataTables on a dynamic number of columns. I've gotten as far as the code below. The problem is the ON statement of the join. How can I make this dynamic based on how many column names are in the list "joinColumnNames".

I was thinking I will need to build some sort of expression tree, but I can't find any examples of how to do this with multiple join columns and with the DataRow object which doesn't have properties for each column.

private DataTable Join(List<string> joinColumnNames, DataTable pullX, DataTable pullY)
{
    DataTable joinedTable = new DataTable();

    // Add all the columns from pullX
    foreach (string colName in joinColumnNames)
    {
        joinedTable.Columns.Add(pullX.Columns[colName]);
    }
    // Add unique columns from PullY
    foreach (DataColumn col in pullY.Columns)
    {
        if (!joinedTable.Columns.Contains((col.ColumnName)))
        {
            joinedTable.Columns.Add(col);
        }
    }

    var Join = (from PX in pullX.AsEnumerable()
                join PY in pullY.AsEnumerable() on 
                // This must be dynamic and join on every column mentioned in joinColumnNames
                new { A = PX[joinColumnNames[0]], B = PX[joinColumnNames[1]] } equals new { A = PY[joinColumnNames[0]], B = PY[joinColumnNames[1]] } 
                into Outer
                from PY in Outer.DefaultIfEmpty<DataRow>(pullY.NewRow())
                select new { PX, PY });

    foreach (var item in Join)
    {
        DataRow newRow = joinedTable.NewRow();
        foreach (DataColumn col in joinedTable.Columns)
        {
            var pullXValue = item.PX.Table.Columns.Contains(col.ColumnName) ? item.PX[col.ColumnName] : string.Empty;
            var pullYValue = item.PY.Table.Columns.Contains(col.ColumnName) ? item.PY[col.ColumnName] : string.Empty;
            newRow[col.ColumnName] = (pullXValue == null || string.IsNullOrEmpty(pullXValue.ToString())) ? pullYValue : pullXValue;
        }
        joinedTable.Rows.Add(newRow);
    }

    return joinedTable;
}

Adding a specific example to show input/output using 3 join columns (Country, Company, and DateId):

Pull X:

Country        Company       DateId    Sales
United States  Test1 Ltd    20160722    $25 

Canada         Test3 Ltd    20160723    $30 

Italy          Test4 Ltd    20160724    $40 

India          Test2 Ltd    20160725    $35 

Pull Y:

Country        Company       DateId    Downloads
United States  Test1 Ltd    20160722    500 

Mexico         Test2 Ltd    20160723    300 

Italy          Test4 Ltd    20160724    900 

Result:

Country        Company       DateId    Sales    Downloads
United States  Test1 Ltd    20160722    $25      500 

Canada         Test3 Ltd    20160723    $30  

Mexico         Test2 Ltd    20160723                300 

Italy          Test4 Ltd    20160724    $40      900 

India          Test2 Ltd    20160725    $35      
2
Can you give a sample input and output? What about the columns from "pullX" that are not in "joinColumnNames"? Don't you want to have these columns in the output?Yacoub Massad
yes, the output should have all columns from x and y. I added a specific example.typheon

2 Answers

1
votes
var Join = 
    from PX in pullX.AsEnumerable()
    join PY in pullY.AsEnumerable()
    on     string.Join("\0", joinColumnNames.Select(c => PX[c]))
    equals string.Join("\0", joinColumnNames.Select(c => PY[c]))
    into Outer
    from PY in Outer.DefaultIfEmpty<DataRow>(pullY.NewRow())
    select new { PX, PY };

Another way is to have both DataTable in a DataSet and use DataRelation
How To: Use DataRelation to perform a join on two DataTables in a DataSet?

1
votes

Since you are using LINQ to Objects, there is no need to use expression trees. You can solve your problem with a custom equality comparer.

Create an equality comparer that can compare equality between two DataRow objects based on the values of specific columns. Here is an example:

public class MyEqualityComparer : IEqualityComparer<DataRow>
{
    private readonly string[] columnNames;

    public MyEqualityComparer(string[] columnNames)
    {
        this.columnNames = columnNames;
    }

    public bool Equals(DataRow x, DataRow y)
    {
        return columnNames.All(cn => x[cn].Equals(y[cn]));
    }

    public int GetHashCode(DataRow obj)
    {
        unchecked
        {
            int hash = 19;
            foreach (var value in columnNames.Select(cn => obj[cn]))
            {
                hash = hash * 31 + value.GetHashCode();
            }
            return hash;
        }
    }
}

Then you can use it to make the join like this:

public class TwoRows
{
    public DataRow Row1 { get; set; }
    public DataRow Row2 { get; set; }
}

private static List<TwoRows> LeftOuterJoin(
    List<string> joinColumnNames,
    DataTable leftTable,
    DataTable rightTable)
{
    return leftTable
        .AsEnumerable()
        .GroupJoin(
            rightTable.AsEnumerable(),
            l => l,
            r => r,
            (l, rlist) => new {LeftValue = l, RightValues = rlist},
            new MyEqualityComparer(joinColumnNames.ToArray()))
        .SelectMany(
            x => x.RightValues.DefaultIfEmpty(rightTable.NewRow()),
            (x, y) => new TwoRows {Row1 = x.LeftValue, Row2 = y})
        .ToList();
}

Please note that I am using method syntax because I don't think that you can use a custom equality comparer otherwise.

Please note that the method does a left outer join, not a full outer join. Based on the example you provided, you seem to want a full outer join. To do this you need to do two left outer joins (see this answer). Here is how the full method would look like:

private static DataTable FullOuterJoin(
    List<string> joinColumnNames,
    DataTable pullX,
    DataTable pullY)
{
    var pullYOtherColumns =
        pullY.Columns
            .Cast<DataColumn>()
            .Where(x => !joinColumnNames.Contains(x.ColumnName))
            .ToList();

    var allColumns = 
        pullX.Columns
            .Cast<DataColumn>()
            .Concat(pullYOtherColumns)
            .ToArray();

    var allColumnsClone =
        allColumns
            .Select(x => new DataColumn(x.ColumnName, x.DataType))
            .ToArray();

    DataTable joinedTable = new DataTable();

    joinedTable.Columns.AddRange(allColumnsClone);

    var first =
        LeftOuterJoin(joinColumnNames, pullX, pullY);

    var resultRows = new List<DataRow>();

    foreach (var item in first)
    {
        DataRow newRow = joinedTable.NewRow();
        foreach (DataColumn col in joinedTable.Columns)
        {
            var value = pullX.Columns.Contains(col.ColumnName)
                ? item.Row1[col.ColumnName]
                : item.Row2[col.ColumnName];

            newRow[col.ColumnName] = value;
        }
        resultRows.Add(newRow);
    }

    var second =
        LeftOuterJoin(joinColumnNames, pullY, pullX);

    foreach (var item in second)
    {
        DataRow newRow = joinedTable.NewRow();
        foreach (DataColumn col in joinedTable.Columns)
        {
            var value = pullY.Columns.Contains(col.ColumnName)
                ? item.Row1[col.ColumnName]
                : item.Row2[col.ColumnName];

            newRow[col.ColumnName] = value;
        }
        resultRows.Add(newRow);
    }

    var uniqueRows =
        resultRows
            .Distinct(
                new MyEqualityComparer(
                    joinedTable.Columns
                        .Cast<DataColumn>()
                        .Select(x => x.ColumnName)
                        .ToArray()));

    foreach (var uniqueRow in uniqueRows)
        joinedTable.Rows.Add(uniqueRow);


    return joinedTable;
}

Please note also how I clone the columns. You cannot use the same column object in two tables.