0
votes

I have HTML saved in a cell in a Google Sheet. Now I would like to extract element values from it. Can anyone please guide me?

Here is the sample HTML that I am working with:

<div class="test"><a href="/this-is-page-url" class="cc_a_a"><div data-react-toolbox="card" class="new_test"><div style="background-image:url(&#x27;https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png&#x27;)" class="new_class" title="this is image"><div class="last"></div></div><div class="new_2"><div class="title_test"><div class="card_title">Title Goes Here</div></div></div><div class="for_text"><p>test goes here</p></div><div class="for_date"><p>Jan 1, 2020</p></div></div></a></div>

I would like to extract:

  • the href value of the a element
  • image background url
  • Title
  • Text
  • Date (another text)

Sample code that I am trying to extract href value. No idea how I can do other element unfortunately.

// Placeholder for the HTML string read from the sheet cell.
var variable_for_cell_with_HTML = "MY_HTML_GOES_HERE_FROM_ABOVE";
 var myurl = variable_for_cell_with_HTML;
// NOTE(review): `document` is a browser DOM global; this is presumably the line
// that raises the reported "ReferenceError: document is not defined".
var doc = document.createElement("html");
// NOTE(review): `rawHTML` is never declared in this snippet; the HTML string above
// is held in `variable_for_cell_with_HTML` / `myurl`.
doc.innerHTML = rawHTML;
var links = doc.getElementsByTagName("a")
var urls = [];

for (var i=0; i<links.length; i++) {

  // Array.prototype.push returns the array's new length, so the value written to B7
  // is a count, not the href; the href itself is only stored in `urls`.
  SpreadsheetApp.getActive().getSheetByName('mysheet').getRange('B7').setValue(urls.push(links[i].getAttribute("href")));
}

Getting ERROR

ReferenceError: document is not defined
2

2 Answers

1
votes

If you're trying to extract specific HTML elements from a given URL, you can follow this general format:

=importxml(A8,"//div[@class='class of desired div']//h3[@class='class of desired h3 element']")

Where A8 is a cell with the web link to the HTML, and where the div or h3 are the tags encompassing your desired result from the page. This is just one example extracting a specific h3 from a specific div, but you could leave off the [@class==] stuff to just return all the h3 elements within the prior div.

I'm sure this could be applied to your specific case as well.

0
votes

It's only HTML after it has been loaded into the browser. Before that it's just a string, so use standard JavaScript string methods.

Something like this regex will get you close to the href URL:

/href="([^"]{1,})"/g 

This will get you close to the background URL:

https:\/\/[^&]{1,}

Regex Tester

This is the html file for my regex tester. I wrote it a long time ago so it's probably a bit neophyte....ish?

<!DOCTYPE html>
<html>
  <head>
    <base target="_top">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
  </head>
  <script>
  // On DOM ready, ask the server (getLastTextPatternFlags) for the saved
  // text, pattern and flag values and copy them into the form controls.
  $(function(){
    // Maps each flag key of the returned object to its checkbox selector.
    var flagBoxes={g:'#set_g',i:'#set_i',m:'#set_m'};

    function restoreForm(rObj){
      $('#text').val(rObj.text);
      $('#pattern').val(rObj.pattern);
      $('#results').css('background','white');
      Object.keys(flagBoxes).forEach(function(flag){
        // A flag is stored as the string 'yes'/'no' (case-insensitive on read).
        var isOn=rObj[flag].toLowerCase()=='yes';
        $(flagBoxes[flag]).prop('checked',isOn);
      });
    }

    google.script.run
    .withSuccessHandler(restoreForm)
    .getLastTextPatternFlags();
  });

  /**
   * Runs the regex from the #pattern box (with the flags from getFlags())
   * against the contents of the #text box and writes the outcome to #results.
   *
   * The #results box is turned yellow while the search runs and back to white
   * when it finishes, whatever the outcome:
   *   - invalid pattern -> 'Check Error in Console Log' (error is logged)
   *   - no match        -> 'No Results'
   *   - matches         -> one 'result[i]= value' line per entry of String.match
   */
  function findData(){
    var $results=$('#results');
    $results.css('background','yellow');
    $results.val('');
    var text=$('#text').val();
    var pattern=$('#pattern').val();
    var flags=getFlags();
    var regex;
    try{
      regex=new RegExp(pattern,flags);
    }
    catch(e){
      console.error(e);
      $results.val('Check Error in Console Log');
      $results.css('background','white');
      // Stop here: matching against an undefined regex would match the empty
      // string and overwrite the error message with a bogus result.
      return;
    }
    // Declared locally so the search does not leak a global `result`.
    var result=text.match(regex);

    var lines=[];
    if(result){
      for(var i=0;i<result.length;i++){
        lines.push('result[' + i + ']= ' + result[i]);
      }
    }
    var rsltLog=lines.join('\n');
    // match() returns null when nothing matches, so guard the length lookup.
    var count=result?result.length:0;
    console.log('module: %s pattern: %s regex: %s flags: %s result: %s length: %s','findData()',pattern,regex,flags,rsltLog,count);

    $results.val(result?rsltLog:"No Results");
    $results.css('background','white');
  }

    /**
     * Builds the RegExp flag string from the three flag checkboxes.
     * @returns {string} Some combination of 'g', 'i' and 'm' (in that order),
     *   or '' when no box is checked.
     */
    function getFlags(){
      var boxes=[['g','#set_g'],['i','#set_i'],['m','#set_m']];
      // Read every checkbox first, then keep the letters whose box is checked.
      var states=boxes.map(function(pair){
        return $(pair[1]).is(':checked');
      });
      return boxes
        .filter(function(pair,idx){ return states[idx]; })
        .map(function(pair){ return pair[0]; })
        .join('');
    }

    /**
     * Sends the contents of the #text box to the server-side saveText function.
     * The box is yellow while the call is pending and white once it succeeds.
     */
    function saveText(){
      var $box=$('#text');
      $box.css('background','yellow');
      var content=$box.val();
      var onSaved=function(){
        $box.css('background','white');
      };
      google.script.run.withSuccessHandler(onSaved).saveText(content);
    }

      /**
       * Sends the contents of the #pattern box to the server-side savePattern
       * function. The box is yellow while the call is pending and white once
       * it succeeds.
       */
      function savePattern(){
        var $box=$('#pattern');
        $box.css('background','yellow');
        var content=$box.val();
        function markSaved(){
          $box.css('background','white');
        }
        google.script.run.withSuccessHandler(markSaved).savePattern(content);
      }

      /**
       * Sends the state of the g/i/m checkboxes to the server-side saveFlags
       * function as an object of 'yes'/'no' strings, e.g. {g:'yes',i:'no',m:'no'}.
       * The #results box is yellow while the call is pending and white once
       * it succeeds.
       */
      function saveFlags(){
        $('#results').css('background','yellow');
        var flagObj={
          g:$('#set_g').is(':checked')?'yes':'no',
          i:$('#set_i').is(':checked')?'yes':'no',
          m:$('#set_m').is(':checked')?'yes':'no'
        };
        var onSaved=function(){
          $('#results').css('background','white');
        };
        google.script.run.withSuccessHandler(onSaved).saveFlags(flagObj);
      }
      // Writes a fixed marker string to the console when this script block is evaluated.
      console.log('My Code');
    </script>
    <style>
    .btns{margin:2px 2px 2px 0;}
    #container{width:100%;}
    </style>
  <body>
    <div id='container'>
    TEXT&nbsp;&nbsp;<input class="btns" type="button" value="Save Text" onClick="saveText();" />
    <br /><textarea id="text" placeholder="Enter the text to be searched" rows="4" cols="60"></textarea>
    <br />PATTERN&nbsp;&nbsp;<input class="btns" type="button" value="Save Pattern" onClick="savePattern();" />
    <br /><textarea id="pattern" placeholder="Enter the regex search expression" rows="4" cols="60"></textarea>
    <br />RESULTS
    <br /><textarea id="results" rows="4" cols="60"></textarea>
    <br /><input type="button" value="Search" onClick="findData();" />&nbsp;&nbsp;<input class="hostcontrol" type="button" value="Close" onClick="google.script.host.close();" />
    &nbsp;&nbsp;g&nbsp;&nbsp;<input id="set_g" type="checkbox" />
    &nbsp;&nbsp;i&nbsp;&nbsp;<input id="set_i" type="checkbox" />
    &nbsp;&nbsp;m&nbsp;&nbsp;<input id="set_m" type="checkbox" />
    &nbsp;&nbsp;<input type="button" value="Save Flags" onClick="saveFlags();" />
    &nbsp;&nbsp;<p>Don't leave extra carriage returns in search pattern textbox.</p>
    </div>
  </body>
</html>

And this is the GS code for it:

/**
 * Adds a 'My Tools' menu to the spreadsheet UI with a single 'Regex Tool'
 * item that runs showRegexDialog.
 */
function onOpen(){
  var menu=SpreadsheetApp.getUi().createMenu('My Tools');
  menu.addItem('Regex Tool', 'showRegexDialog');
  menu.addToUi();
}

/**
 * Opens the 'RegexTester' HTML file as an 800x500 modeless dialog titled
 * 'Regex Tester'.
 */
function showRegexDialog(){
  var dialog=HtmlService.createHtmlOutputFromFile('RegexTester');
  dialog.setWidth(800);
  dialog.setHeight(500);
  SpreadsheetApp.getUi().showModelessDialog(dialog, 'Regex Tester');
}

/**
 * Reads the data range of the 'Input' sheet and turns it into an object,
 * using column A of each row as the key and column B as the value.
 * The object is also written to the Apps Script log.
 * @returns {Object} Key/value pairs taken from the first two columns.
 */
function getLastTextPatternFlags(){
  var rows=SpreadsheetApp.getActive()
    .getSheetByName('Input')
    .getDataRange()
    .getValues();
  var rObj={};
  rows.forEach(function(row){
    rObj[row[0]]=row[1];
  });
  Logger.log(rObj);
  return rObj;
}

/**
 * Stores txt in column B of every 'Input' sheet row whose column A is 'text',
 * then writes the whole data range back to the sheet.
 * @param {string} txt The text to save.
 * @returns {boolean} Always true.
 */
function saveText(txt){
  var range=SpreadsheetApp.getActive().getSheetByName('Input').getDataRange();
  var rows=range.getValues();
  rows.forEach(function(row){
    if(row[0]=='text'){
      row[1]=txt;
    }
  });
  range.setValues(rows);
  return true;
}

/**
 * Stores txt in column B of every 'Input' sheet row whose column A is
 * 'pattern', then writes the whole data range back to the sheet.
 * @param {string} txt The regex pattern text to save.
 * @returns {boolean} Always true.
 */
function savePattern(txt){
  var range=SpreadsheetApp.getActive().getSheetByName('Input').getDataRange();
  var rows=range.getValues();
  rows
    .filter(function(row){ return row[0]=='pattern'; })
    .forEach(function(row){ row[1]=txt; });
  range.setValues(rows);
  return true;
}

/**
 * Copies the g/i/m entries of flagObj into column B of the 'Input' sheet rows
 * whose column A (compared case-insensitively) is 'g', 'i' or 'm', then
 * writes the whole data range back to the sheet.
 * @param {Object} flagObj Flag values keyed by 'g', 'i' and 'm'.
 * @returns {boolean} Always true.
 */
function saveFlags(flagObj){
  var flagNames=['g','i','m'];
  var range=SpreadsheetApp.getActive().getSheetByName('Input').getDataRange();
  var rows=range.getValues();
  rows.forEach(function(row){
    var key=String(row[0]).toLowerCase();
    if(flagNames.indexOf(key)!==-1){
      row[1]=flagObj[key];
    }
  });
  range.setValues(rows);
  return true;
}


/**
 * Serves the 'RegexTester' HTML file with an extra style rule appended that
 * hides elements of class 'hostcontrol', and with the X-Frame-Options mode
 * set to ALLOWALL.
 * @returns {HtmlOutput} The page to serve.
 */
function doGet(){
  return HtmlService.createHtmlOutputFromFile('RegexTester')
    .append('<style>.hostcontrol{display:none;}</style>')
    .setXFrameOptionsMode(HtmlService.XFrameOptionsMode.ALLOWALL);
}

Pour it into a dialog and play with it..