compare two spreadsheet and output the difference using google app scripts

Question

well, i'm trying to do what described in title. Both spreadsheets have only one sheet that are the ones i'm comparing. One spreadsheet is and update of the other, so i'm trying to get only new content. (if it were a fc (dos command) like function this would be easy...)

After doing some search, i have the folloing script that should work on most cases, that uses arrays for each sheet.

function test() {
  var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId =Folder.getId();
  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();
  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length);  
  var newarray = getNewData(array,old_array);
  Logger.log('there are ' + newarray.length + 'different rows');
}

function getNewData(array1,array2){    
  var diff =array2;   
  for (var i = 0; i<array1.length; i++){
    var duplicate = false;
    for (var j = 0;j<diff.length;j++){
      if (array1[i].join() == diff[j].join()){
        Logger.log('duplicated line found on rows ' + i + ':' + j);
        diff.splice(j,1);
        var duplicate= true;
        break;
      }    
    }
    if (duplicate==false) {
      Logger.log('not duplicated line found on row ' + i);
      diff.push(array1[i]);            
    }
  }
  return diff;
}

The thing is that the files are too big, almost 30000 rows, so the scripts exceed 5 minutes limit for execution.

Is there a way to improve this, like for instance, eliminate the inner for loop? Or there is a way to do it in parts? like first the first 5000 rows, and so on.

Regards,

EDIT: after analizing the spreadsheet a little, i found out that there is a ID for every row, so now i can concentrate the search only in one column of each spreadsheet. So here is my new implementation:

function test(){
var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId =Folder.getId();
  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();
  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length); 
  //The COlumn has an indicator, so i search for that. I don't control the formatting of the files, so i search in both spreadsheet for the indicator
  var searchString = 'NAME';
  for (var i = 0; i < old_array.length; i++) {    
    for (var j = 0; j < old_array[i].length; j++) {    
      if (old_array[i][j] == searchString) {
        var Row_old = i+1;
        var Column_old = j;
        break;
      }      
    }
    if (Row_old != undefined){
      break;
    }
  }
  for (var i = 0; i < array.length; i++) {    
    for (var j = 0; j < array[i].length; j++) {    
      if (array[i][j] == searchString) {
        var Row = i+1;
        var Column = j;
        break;
      }      
    }
    if (Row != undefined){
      break;
    }
  }

  Logger.log(Row_old+':::'+Column_old+'\n'+Row+':::'+Column);  

  var diff_index =[];
  var row_ind = 0;  
  for (var i=Row;i<array.length;i++){        
    Logger.log(i);
    var existe = ArrayLib.indexOf(old_array, Column_old, array[i][Column]);
    if (existe==-1){      
      Logger.log(row_ind+'!!!');
      diff_index[row_ind]=i;
      row_ind++;          
    }
  }
  Logger.log(diff_index);
}

This still run out of time... I will now try to incorporate your comments.

If this is always run on the same two files, then get the ID out of the file's URL and open the files directly instead of searching the drive for them. — Karl_S
i don't get what you mean by opening directly... But, the files are updated every day with new info (a new file with the new info (sheet) its downloaded/uploaded to drive and the old file gets _old appended to its name, is here where i want to compare them) — kurokirasama
In your code, you are iterating over files and finding the relevant file, instead of doing this it is better to open the spreadsheet using unique id assigned to it. You can find it in the URL when open a spreadsheet — Parag Jadhav
ok. I can't do that because the files get updated (via download) every day (so they are actually different files). I rename them to keep the name, but Id's are changing every day. Besides, it takes almost no time in searchig for the two files, all execution time is in the for loops. — kurokirasama
Check the Best Practices for Spreadsheets. It mentions optimization tips like Use batch operations, Avoid libraries in UI-heavy scripts, and Use the Cache service. — noogui

Robin Gertenbach Robin Gertenbach · Accepted Answer · 2017-06-11T09:58:38

Your script has a few major bottlenecks that slow it down massively:

Starting both loops at 0 every time makes its runtime explode
splicing every time you find a duplicate requires to move the array around
string concatenating an array on every iteration

We can circumvent these issues by:

sorting the second range once
I'm sure there's something clever to be done by iteratively binary searching through every column but we'd have to resort every time so we'll binary search the first column and then do a linear search.

We will use ArrayLib for the sorting (I hope it's a fast sorting algorithm).

Let's start with a function to find the first row where the first column matches a value (the first column of the current row):

function firstRowMatchingCol1(target, lookupRange) {
  var min = 0;
  var max = lookupRange.length - 1;
  var guess;
  var guessVal;

  while(min <= max) {
    guess = (min + max) / 2 | 0;
    guessVal = lookupRange[guess][0];

    if (guessVal < target) {
      min = guess + 1;
    } else if (guessVal > target) {
      max = guess - 1; 
    } else {
      while (guess > 0 && lookupRange[guess - 1][0] === target) {
        guess -= 1; 
      }
      return guess;
    }
  }
  return -1;
}

Now we can go linearly go through every row and check if the columns match until the first column doesn't match anymore.

function matchExists(row, lookupRange) {
  var index = firstRowMatchingCol1(row[0], lookupRange); 
  if (index === -1) {return false;}

  while (index < lookupRange.length && lookupRange[index][0] === row[0]) {
    for (var col = 1; col < row.length; col++) {
      if (row[col] !== lookupRange[index][col]) {break;}
      if (col === row.length - 1) {return true;} // This only works if the ranges are at least two columns wide but if they are one column wide you can just check if index > -1
    }
    index += 1;
  }
  return false;  
}

And finally we can get the duplicates like this:

function getNonDuplicates(r1, r2) {
  r2 = ArrayLib.sort(r2, 0, true);  
  return r1.filter(function(row) {return !matchExists(row, r2);});
}

Like mTorres' code this is untested

compare two spreadsheet and output the difference using google app scripts

4 Answers