I am trying to build a Wikipedia link crawler on Google App Engine. I want to store an index in the datastore, but I run into DeadlineExceededError in both cron jobs and task queue tasks.
For the cron job I have this code:
def buildTree(self, budget=10):
    """Expand TreeNode entities that have no branches yet, within a time budget.

    Iterates over every TreeNode, calls ``addBranches`` for each one whose
    ``branches`` list is empty, and stops once ``budget`` seconds of
    wall-clock time have been used.

    Args:
        budget: Wall-clock seconds to spend before the loop stops. The
            check runs between nodes, so a single slow ``addBranches``
            call can still overrun it.
    """
    start = time.time()
    self.log.info(" Start Time: %f" % start)
    for tree in TreeNode.all():
        # Check the budget before starting new work rather than after it.
        if time.time() - start > budget:
            break
        if tree.branches:
            continue
        # addBranches takes the node *name* (it queries "WHERE name=:1"),
        # not the entity itself.
        self.addBranches(tree.name)
        # Pause only after an actual fetch (presumably to throttle the
        # remote requests); sleeping for already-expanded nodes would burn
        # the budget without doing any work.
        time.sleep(1)
    self.log.info("Time Eclipsed: %f" % (time.time() - start))
    # Same clock as the start time; time.clock() is not wall-clock time.
    self.log.info(" End Time:%f" % time.time())
I don't understand why the for loop doesn't break after 10 seconds. It does on the dev server. Something must be wrong with time.time() on the production server. Is there another function I can use?
For the task queue I have this code:
def addNewBranch(self, keyword, level=0):
    """Expand ``keyword`` and enqueue a task for each unexpanded child node.

    Calls ``addBranches(keyword)``, reloads the stored TreeNode by name and,
    while ``level`` is below 3, adds one ``/addTree/<name>`` task for every
    entry of the node's ``nodes`` whose ``branches`` list is empty.

    Args:
        keyword: Name of the TreeNode to expand.
        level: Current depth; nothing is enqueued once it reaches 3.
    """
    self.log.debug("Add Tree")
    self.addBranches(keyword)
    t = TreeNode.gql("WHERE name=:1", keyword).get()
    if t is None:
        # addBranches can return before saving a new node (e.g. on a parse
        # error), in which case there is nothing to walk.
        self.log.error("No TreeNode stored for: %s" % keyword)
        return
    if level >= 3:
        return
    # NOTE(review): `level` is not forwarded to the enqueued task. If the
    # /addTree handler calls this method with the default level, the depth
    # limit never triggers -- confirm against the handler.
    # NOTE(review): branch.name is put into the URL unescaped -- confirm the
    # handler copes with spaces and non-ASCII titles.
    for branch in t.nodes:
        if branch.branches:
            continue
        url = "/addTree/%s" % branch.name
        taskqueue.add(url=url)
        self.log.debug("url:%s" % url)
The logs show that they both run into the DeadlineExceededError. Shouldn't background processing have a longer deadline than the 30 seconds allowed for a page request? Is there a way around the exception?
Here is the code for addBranches():
def addBranches(self, keyword):
    """Fetch the links of Wikipedia page ``keyword`` and store them as branches.

    Loads the TreeNode named ``keyword`` (creating an unsaved one if none
    exists), requests the page's links from the Wikipedia API as XML, and
    appends the key of a TreeNode for every ``<pl>`` element to
    ``tree.branches``. Linked nodes that do not exist yet are created and
    saved in a single ``db.put`` call.

    Args:
        keyword: Page title / TreeNode name to expand.

    Returns:
        ``tree.branches`` after the node has been saved, or ``None`` when
        the page could not be fetched or parsed (nothing is saved then).
    """
    tree = TreeNode.gql("WHERE name=:1", keyword).get()
    if tree is None:
        tree = TreeNode(name=keyword)
    self.log.debug("in addBranches arguments: tree %s", tree.name)

    t = urllib2.quote(tree.name.encode('utf8'))
    s = "http://en.wikipedia.org/w/api.php?action=query&titles=%s&prop=links&pllimit=500&format=xml" % t
    self.log.debug(s)

    try:
        usock = urllib2.urlopen(s)
    except Exception:
        # Deliberately not a bare ``except:``, which would also swallow
        # BaseException-derived signals (SystemExit, KeyboardInterrupt, ...).
        self.log.error("Could not retrieve doc: %s" % tree.name)
        return None

    try:
        xmldoc = minidom.parse(usock)
    except Exception as error:
        self.log.error("Parse Error: %s" % error)
        return None
    finally:
        # Close the response on the failure path too.
        usock.close()

    try:
        pyNode = xmldoc.getElementsByTagName('pl')
        self.log.debug("Nodes to be added: %d" % pyNode.length)
    except Exception as e:
        self.log.error("Getting Nodes Error: %s" % e)
        return None

    newNodes = []
    # NOTE(review): this issues one datastore query per link, and the request
    # asks for up to 500 links (pllimit=500) -- likely the dominant cost of
    # this method; consider batching the lookups.
    for child in pyNode:
        title = child.attributes["title"].value
        node = TreeNode.gql("WHERE name=:1", title).get()
        if node is None:
            newNodes.append(TreeNode(name=title))
        else:
            tree.branches.append(node.key())

    # Save all new nodes in one call; their keys are read only after the put.
    db.put(newNodes)
    for node in newNodes:
        tree.branches.append(node.key())
        self.log.debug("Node Added: %s" % node.name)
    tree.put()
    return tree.branches
What does `addBranches` look like? - Amber
Use `urlfetch` and not `urlopen` to get the Wikipedia page in `addBranches()`: code.google.com/appengine/docs/python/urlfetch - Jason Hall