6
votes

well, i'm trying to do what described in title. Both spreadsheets have only one sheet that are the ones i'm comparing. One spreadsheet is and update of the other, so i'm trying to get only new content. (if it were a fc (dos command) like function this would be easy...)

After doing some search, i have the folloing script that should work on most cases, that uses arrays for each sheet.

function test() {
  var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId =Folder.getId();
  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();
  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length);  
  var newarray = getNewData(array,old_array);
  Logger.log('there are ' + newarray.length + 'different rows');
}

function getNewData(array1,array2){    
  var diff =array2;   
  for (var i = 0; i<array1.length; i++){
    var duplicate = false;
    for (var j = 0;j<diff.length;j++){
      if (array1[i].join() == diff[j].join()){
        Logger.log('duplicated line found on rows ' + i + ':' + j);
        diff.splice(j,1);
        var duplicate= true;
        break;
      }    
    }
    if (duplicate==false) {
      Logger.log('not duplicated line found on row ' + i);
      diff.push(array1[i]);            
    }
  }
  return diff;
}

The thing is that the files are too big, almost 30000 rows, so the scripts exceed 5 minutes limit for execution.

Is there a way to improve this, like for instance, eliminate the inner for loop? Or there is a way to do it in parts? like first the first 5000 rows, and so on.

Regards,

EDIT: after analizing the spreadsheet a little, i found out that there is a ID for every row, so now i can concentrate the search only in one column of each spreadsheet. So here is my new implementation:

function test(){
var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId =Folder.getId();
  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();
  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length); 
  //The COlumn has an indicator, so i search for that. I don't control the formatting of the files, so i search in both spreadsheet for the indicator
  var searchString = 'NAME';
  for (var i = 0; i < old_array.length; i++) {    
    for (var j = 0; j < old_array[i].length; j++) {    
      if (old_array[i][j] == searchString) {
        var Row_old = i+1;
        var Column_old = j;
        break;
      }      
    }
    if (Row_old != undefined){
      break;
    }
  }
  for (var i = 0; i < array.length; i++) {    
    for (var j = 0; j < array[i].length; j++) {    
      if (array[i][j] == searchString) {
        var Row = i+1;
        var Column = j;
        break;
      }      
    }
    if (Row != undefined){
      break;
    }
  }

  Logger.log(Row_old+':::'+Column_old+'\n'+Row+':::'+Column);  

  var diff_index =[];
  var row_ind = 0;  
  for (var i=Row;i<array.length;i++){        
    Logger.log(i);
    var existe = ArrayLib.indexOf(old_array, Column_old, array[i][Column]);
    if (existe==-1){      
      Logger.log(row_ind+'!!!');
      diff_index[row_ind]=i;
      row_ind++;          
    }
  }
  Logger.log(diff_index);
}

This still run out of time... I will now try to incorporate your comments.

4
If this is always run on the same two files, then get the ID out of the file's URL and open the files directly instead of searching the drive for them.Karl_S
i don't get what you mean by opening directly... But, the files are updated every day with new info (a new file with the new info (sheet) its downloaded/uploaded to drive and the old file gets _old appended to its name, is here where i want to compare them)kurokirasama
In your code, you are iterating over files and finding the relevant file, instead of doing this it is better to open the spreadsheet using unique id assigned to it. You can find it in the URL when open a spreadsheetParag Jadhav
ok. I can't do that because the files get updated (via download) every day (so they are actually different files). I rename them to keep the name, but Id's are changing every day. Besides, it takes almost no time in searchig for the two files, all execution time is in the for loops.kurokirasama
Check the Best Practices for Spreadsheets. It mentions optimization tips like Use batch operations, Avoid libraries in UI-heavy scripts, and Use the Cache service.noogui

4 Answers

1
votes

Your script has a few major bottlenecks that slow it down massively:

  • Starting both loops at 0 every time makes its runtime explode
  • splicing every time you find a duplicate requires to move the array around
  • string concatenating an array on every iteration

We can circumvent these issues by:

  • sorting the second range once
  • I'm sure there's something clever to be done by iteratively binary searching through every column but we'd have to resort every time so we'll binary search the first column and then do a linear search.

We will use ArrayLib for the sorting (I hope it's a fast sorting algorithm).

Let's start with a function to find the first row where the first column matches a value (the first column of the current row):

function firstRowMatchingCol1(target, lookupRange) {
  var min = 0;
  var max = lookupRange.length - 1;
  var guess;
  var guessVal;

  while(min <= max) {
    guess = (min + max) / 2 | 0;
    guessVal = lookupRange[guess][0];

    if (guessVal < target) {
      min = guess + 1;
    } else if (guessVal > target) {
      max = guess - 1; 
    } else {
      while (guess > 0 && lookupRange[guess - 1][0] === target) {
        guess -= 1; 
      }
      return guess;
    }
  }
  return -1;
}

Now we can go linearly go through every row and check if the columns match until the first column doesn't match anymore.

function matchExists(row, lookupRange) {
  var index = firstRowMatchingCol1(row[0], lookupRange); 
  if (index === -1) {return false;}

  while (index < lookupRange.length && lookupRange[index][0] === row[0]) {
    for (var col = 1; col < row.length; col++) {
      if (row[col] !== lookupRange[index][col]) {break;}
      if (col === row.length - 1) {return true;} // This only works if the ranges are at least two columns wide but if they are one column wide you can just check if index > -1
    }
    index += 1;
  }
  return false;  
}

And finally we can get the duplicates like this:

function getNonDuplicates(r1, r2) {
  r2 = ArrayLib.sort(r2, 0, true);  
  return r1.filter(function(row) {return !matchExists(row, r2);});
}

Like mTorres' code this is untested

0
votes

The solution I'm proposing is a "hack" around the time limit. But if you want a cleaner solution, you could, if possible, reorganize and make your code more efficient by having the arrays ordered somehow.

You don't specify the data inside array1 and array2, if rows had some sort of ID field you could order by this ID and check row i on array1 and row i on array2 instead of comparing every row in array1 with every row in array2 (which is extremely inefficient with 30000 rows).

If your data does not have an ID field to order the rows, then what you could is something based on my proposed solution: add a track for every compared row on array1. When the run reaches the time limit then you run again the function but starting from the last compared row (you would know which was because you'll be tracking the compared rows), and when the second run times out you repeat, and so on.

Every time you run your comparison you ask if it's the first run (or use a boolean - I prefer to ask the user, this way you won't forget to change the boolean), if it's the first run, you delete the tracking column, if it's not the first run, you'll start with the next to last tracked row so basically continuing your script where it ended. I've been using this technique with good results.

In code (untested, so check it out before running it with real data):

/**
 * Only checks if it's the first run and calls the real work function
 */
function test() {
  var firstRun = "yes" === Browser.msgBox("Question", "Is this the first run?", Browser.Buttons.YES_NO);
  doTest(firstRun);
}

/**
 * Gets the data of the 2 spreadsheets and also the starting
 * row
 */
function doTest(firstRun) {
  var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId = Folder.getId();

  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();

  /**
   * Here is the code to create the tracking hability
   */
  var strartFromRow = 0; // 0 because row 1 is array 0 index when you getValues();
  var trackSheet = old_spreadsheet.getSheetByName("Tracking");
  if (trackSheet === null) {
    trackSheet = old_spreadsheet.insertSheet("Tracking");
  }

  if (firstRun) {
    trackSheet.getRange("A:A").clearContent();   // make sure there no row is tracked yet
  }
  else {
    // we have to continue from the previous row, keep in mind you're making the comparison 
    // with array which is 0 based, but sheet is 1 based, but you want the next one so getLasRow()
    // should be the first item to compare on your array
    strartFromRow = trackSheet.getLastRow(); 
  }

  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);

  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length);  

  // when you call the DIFF function, pass the tracking sheet and the start Row
  var newarray = getNewData(array,old_array, trackSheet, startFromRow);

  Logger.log('there are ' + newarray.length + 'different rows');
}

/**
 * Creates a diff array using array1 and array2
 * It marks each element on array1 once it has checked if it's in array2
 */
function getNewData(array1, array2, trackingSheet, startFromRow){
  var logRow = trackingSheet.getLastRow();
  var diff = array2;   
  for (var i = startFromRow; i < array1.length; i++){
    var duplicate = false;
    for (var j = 0; j < diff.length;j++){
      if (array1[i].join() == diff[j].join()){
        Logger.log('duplicated line found on rows ' + i + ':' + j);
        diff.splice(j,1);
        duplicate = true;
        break;
      }    
    }
    if (duplicate === false) {
      Logger.log('not duplicated line found on row ' + i);
      diff.push(array1[i]);            
    }
    trackingSheet.getRange(logRow++, 1).setValue("Checked!");  // Mark i row as checked
  }
  return diff;
}
0
votes

Here's an alternate solution that gets around the time limit. Create a new dedicated spreadsheet along with a custom sidebar. The sidebar will require you to create some HTML that will ultimately be embedded and rendered in an iframe on the client. You can embed pure javascript into the HTML via script tags.

The beauty of this approach is that these scripts will not run server-side but on the client independently of Google Apps Script's server-side environment and are not subject to the 6 minute limit. Moreover, they can also call functions in your Google Script. So one approach would be to have the client-side scripts call a Google Script function to retrieve the requisite data, do all the heavy processing in the client-side scripts, and then send the results back to the server-side script to update the sheet.

Here's a link to setting up a custom sidebar to get you started: https://developers.google.com/apps-script/guides/dialogs#custom_sidebars

0
votes

Finally, i decided to go for the Cache service option, here is the code and i'm testing it to see if i keep with this.

function getNewData() {
  //deleting triggers
  var triggers = ScriptApp.getProjectTriggers();    
  for (var i = 0; i < triggers.length; i++) {
    if (triggers[i].getHandlerFunction()=='getNewData'){
      ScriptApp.deleteTrigger(triggers[i]);
    }
  }  
  //max running time = 5.5 min
  var MAX_RUNNING_TIME = 330000;
  var startTime= (new Date()).getTime();
  //get cache
  var cache = CacheService.getUserCache();
  var downloaded =JSON.parse(cache.get('downloaded'));  
  var compared =JSON.parse(cache.get('compared'));
  //start
  if (downloaded==1 && compared!=1){
    //folder
    var Folder = DriveApp.getFoldersByName('theFolder').next();
    var FolderId = licitacionesFolder.getId();
    //call old_spreadsheet
    var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
    var files = DriveApp.searchFiles(searchFor); 
    var old_file = files.next();  
    var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
    var old_sheet = old_spreadsheet.getSheets()[0];
    var old_array = old_sheet.getDataRange().getValues();
    //call spreadsheet
    var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
    var files = DriveApp.searchFiles(searchFor); 
    var file = files.next();  
    var spreadsheet = SpreadsheetApp.openById(old_file.getId());
    var sheet = spreadsheet.getSheets()[0];
    var array = sheet.getDataRange().getValues();
    Logger.log(array.length+'::'+old_array.length); 
    // Column
    var searchString = 'NAME';
    var RC = getColumn(array,searchString);    
    var Row = RC.Row;
    var Column = RC.Column;
    var RC = getColumn(old_array,searchString);    
    var Row_old = RC.Row;
    var Column_old = RC.Column;    
    Logger.log(Row_old+':::'+Column_old+'\n'+Row+':::'+Column);      
    //compare 
    var diff_index =JSON.parse(cache.get('diff_index'));
    var row_ind =JSON.parse(cache.get('row_ind'));  
    var Roww =JSON.parse(cache.get('Row'));  
    if (diff_index==null){var diff_index = [];}
    if (row_ind==null){var row_ind = 0;}
    if (Roww==null){var Roww = Row;}    
    Logger.log(row_ind+'\n'+Roww);    
    for (var i=Roww;i<array.length;i++){  
      var currTime = (new Date()).getTime();
      if(currTime - startTime >= MAX_RUNNING_TIME){
        Logger.log((currTime - startTime)/(1000*60));
        Logger.log(i+'::'+row_ind);
        cache.putAll({'diff_index': JSON.stringify(diff_index),'row_ind': JSON.stringify(row_ind),'Row': JSON.stringify(i-1)},21600);   
        ScriptApp.newTrigger('getNewData').timeBased().after(2 * 60 * 1000).create();
        return;
      } else {
        Logger.log(i);
        var existe = ArrayLib.indexOf(old_array, Column_old, array[i][Column]);
        if (existe==-1){      
          Logger.log(row_ind+'!!!');
          diff_index[row_ind]=i;
          row_ind++;          
        }
      }
    }    
    cache.putAll({'diff_index': JSON.stringify(diff_index),'Row': JSON.stringify(Row),'compared': JSON.stringify(1)},21600);
  } else {
    Logger.log('file not downloaded yet or already compared');
  }
}

function getColumn(array,searchString){
  for (var i = 0; i < array.length; i++) {    
    for (var j = 0; j < array[i].length; j++) {    
      if (array[i][j] == searchString) {
        var Row = i+1;
        var Column = j;
        break;
      }      
    }
    if (Row != undefined){
      break;
    }
  }
  return {Row: Row, Column: Column};
}