I use url lib
, urllib2
, cookie lib
to scrape a web:get
the login page and post the data.
def getpage():
codeurl=r"http://www.xxx/sign_in"
request=urllib2.Request(codeurl)
response=urllib2.urlopen(request)
return response
def parsecode(response):
"""
parse the login page to get the changed code
"""
pattern=re.compile(r"""<meta.*?csrf-token.*?content=(.*?)\s/>""")
code=re.findall(pattern,response.read())[0]
return code
def Hand():
"""
deal with cookie and header
"""
headers={
"Referer":"xxx",
"User-Agent":"xxx"
}
ck=cookielib.MozillaCookieJar()
handle=urllib2.HTTPCookieProcessor(ck)
openner=urllib2.build_opener(handle)
head=[]
for key,value in headers.items():
tup=(key,value)
head.append(tup)
openner.addheaders = head
return openner
def postdata(code,openner):
"""
post the data xxx.com needed
"""
logurl=r"http://www.jianshu.com/sessions"
sign_in={"name":"xxx","password":"xxx","authenticity_token":code}
data=urllib.urlencode(sign_in).encode("utf-8")
x=openner.open(logurl,data)
for item in ck:
print item
However,I met this bug:
Traceback (most recent call last):
File "jianshu.py", line 80, in postdata(code,op)
File "jianshu.py", line 43, in postdata x=openner.open(logurl,data)
File "/usr/lib64/python2.7/urllib2.py", line 437, in open response = meth(req, response) File "/usr/lib64/python2.7/urllib2.py", line 550, in http_response 'http', request, response, code, msg, hdrs)
File "/usr/lib64/python2.7/urllib2.py", line 475, in error return self._call_chain(*args)
File "/usr/lib64/python2.7/urllib2.py", line 409, in _call_chain result = func(*args)
File "/usr/lib64/python2.7/urllib2.py", line 558, in http_error_default raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) urllib2.HTTPError: HTTP Error 500: Internal Server Error