I am trying to read a list of URLs that I have in a Google Docs spreadsheet. What I want to do is read the URLs in from the spreadsheet and then scrape each URL.
import gdata.docs.data
import gdata.docs.client
import gdata.docs.service
import gdata.spreadsheet.service
import re, os
username = '[email protected]'
password = 'mypassword'
doc_name = 'My document'
gd_client = gdata.spreadsheet.service.SpreadsheetsService()
gd_client.email = username
gd_client.password = password
gd_client.source = 'https://docs.google.com/spreadsheet/ccc? key=0AkGb10ekJtfQdG9EOHN0VzRDdVhWaG1kNVEtdVpyRlE#gid=0'
gd_client.ProgrammaticLogin()
q = gdata.spreadsheet.service.DocumentQuery()
q['title'] = doc_name
q['title-exact'] = 'true'
feed = gd_client.GetSpreadsheetsFeed(query=q)
spreadsheet_id = feed.entry[0].id.text.rsplit('/',1)[1]
feed = gd_client.GetWorksheetsFeed(spreadsheet_id)
worksheet_id = feed.entry[0].id.text.rsplit('/',1)[1]
rows = gd_client.GetListFeed(spreadsheet_id, worksheet_id).entry
for row in rows:
for key in row.custom:
urls = row.custom[key].text
newlist = urls
print 'this is a list', newlist
elec_urls = newlist.strip()
#After this each the Url in the list is scraped using scraperwiki
This works fine if I only have one URL in the spreadsheet, but I don't: when I have more than one URL in the document, the program only scrapes the last URL.
I thought using a loop would solve this — something to cycle from newlist[0] to newlist[i] — but I found that newlist[0] equals 'h' (the first character of the h t t p:// of the last-entered URL), newlist[1] equals 't', and so on, because newlist is a single string rather than a list.
Any help would be appreciated thanks.