2 votes

I am programming on Ubuntu with Python 2.7 and MySQLdb, and I get an error whenever I use languages other than English in Python. English-only text does not trigger the error.

Traceback (most recent call last):
  File "crawl.py", line 242, in <module>
    parseArticle( u )
  File "crawl.py", line 146, in parseArticle
    gatherNeighborInfo( soup )
  File "crawl.py", line 69, in gatherNeighborInfo
    db.updateURL( url , '자신의 글 주소들을 db에 저장합니다' )
  File "crawl.py", line 211, in updateURL
    self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xec in position 33: ordinal not in range(128)

So I tried to change the default encoding from ascii to utf-8. I created a file named sitecustomize.py in /usr/local/lib/python2.7/site-packages, with the source code below.

import sys
sys.setdefaultencoding("utf-8")

But nothing changed. Please help me. Here is the whole source code.

 # -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import robotparser
import urllib2
import time, traceback, re
import MySQLdb

crawler_name = 'daum_blog_crawler'      
mainpage = 'http://blog.daum.net/'      

# robot parser setting.
rp = robotparser.RobotFileParser( mainpage + 'robots.txt' )
rp.read()

def canFetch( url ):
        return rp.can_fetch( crawler_name, url )

def getContent( url, delay=1):
        time.sleep( delay )

        if not canFetch( url ):
                #print 'This url can NOT be fetched by our crawler :', url
                return None
        try:
                opener = urllib2.build_opener()
                opener.addheaders = [('User-agent',crawler_name)]
                contents = opener.open(url).read()
        except:
                traceback.print_exc()
                return None
        return contents

def getArticleInfo( soup ):

        rBlog = re.compile('.+blog.daum.net/\w+/\d+.*?')
        URLs = soup('a',{'href':rBlog})

        return [ u.get('href').split('?')[0] for u in URLs ]

def getOwnArticles( contents ):
        ret = []
        soup = BeautifulSoup( contents )
        rBlog = re.compile('.+/BlogTypeView.+')
        for u in soup('a',{'href':rBlog}):
                href = u.get('href')
                article = href.split('articleno=')[1].split('&')[0]
                if ret.count(article)<1:
                        ret.append( article )
        return ret

def gatherNeighborInfo( soup ):


        rBlog = re.compile('http://blog.daum.net/\w+')
        Neighbors = soup('a',{'href':rBlog})
        cnt = 0
        for n in Neighbors:
                url = n.get('href')
                blogname = url.split('/')[-1]
                if url and url.startswith('http://') and db.isCrawledURL(url)<1:
                        db.insertURL( url, 1 ) 
                        db.updateURL( url , '자신의 글 주소들을 db에 저장합니다' )  # Korean: "store this blog's own article URLs in the db"

                        url2 = getRedirectedURL( url )
                        if not url2: continue
                        re_url = 'http://blog.daum.net' + url2
                        body = getContent( re_url, 0 ) 
                        if body:
                                for u in getOwnArticles( body ):

                                        fullpath = 'http://blog.daum.net/'+blogname+'/'+u
                                        cnt+=db.insertURL( fullpath )

        if cnt>0: print '%d neighbor articles inserted'%cnt

def getRedirectedURL( url ):

        contents = getContent( url )
        if not contents: return None

        #redirect
        try:
                soup = BeautifulSoup( contents )
                frame = soup('frame')           
                src = frame[0].get('src')
        except:
                src = None
        return src

def getBody( soup, parent ):

        rSrc = re.compile('.+/ArticleContentsView.+')
        iframe = soup('iframe',{'src':rSrc})
        if len(iframe)>0: 
                src = iframe[0].get('src')
                iframe_src = 'http://blog.daum.net'+src


                req = urllib2.Request( iframe_src )
                req.add_header('Referer', parent )
                body = urllib2.urlopen(req).read()
                soup = BeautifulSoup( body )
                return str(soup.body)
        else:
                print 'NULL contents'
                return ''

def parseArticle( url ):

        article_id = url.split('/')[-1]
        blog_id = url.split('/')[-2]

        if blog_id.isdigit():
                print 'digit:', url.split('/')


        newURL = getRedirectedURL( url )

        if newURL:

                newURL = 'http://blog.daum.net'+newURL
                print 'redirecting', newURL
                contents = getContent( newURL, 0 )
                if not contents:
                        print 'Null Contents...'

                        db.updateURL( url, -1 )
                        return


                soup = BeautifulSoup( contents )


                gatherNeighborInfo( soup )              


                n=0
                for u in getArticleInfo( soup ):
                        n+=db.insertURL( u )
                if n>0: print 'inserted %d urls from %s'%(n,url)

                sp = contents.find('<title>')
                if sp>-1:
                        ep = contents[sp+7:].find('</title>')
                        title = contents[sp+7:sp+ep+7]
                else:
                        title = ''

                contents = getBody( soup, newURL )  


                pStyle = re.compile('<style(.*?)>(.*?)</style>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
                contents = pStyle.sub('', contents)

                pStyle = re.compile('<script(.*?)>(.*?)</script>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
                contents = pStyle.sub('', contents)

                pStyle = re.compile("<(.*?)>", re.IGNORECASE | re.MULTILINE | re.DOTALL )
                contents = pStyle.sub("", contents)


                db.updateURL( url , '처리했다고 db에 표시합니다.' )  # Korean: "mark as processed in the db"

        else:
                print 'Invalid blog article...'

                db.updateURL( url, 'None', -1 )

class DB:
        "MySQL wrapper class"
        def __init__(self):
                self.conn = MySQLdb.connect(db='crawlDB', user='root', passwd='......')
                self.cursor = self.conn.cursor()
                self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url CHAR(150), state INT, content TEXT)')
        def commit(self):
                self.conn.commit()
        def __del__(self):
                self.conn.commit()
                self.cursor.close()

        def insertURL(self, url, state=0, content=None):
                if url[-1]=='/': url=url[:-1]
                try:    
                        self.cursor.execute("INSERT INTO urls VALUES ('%s',%d,'%s')"%(url,state,content))
                except:
                        return 0
                else:
                        return 1

        def selectUncrawledURL(self):
                self.cursor.execute('SELECT * FROM urls where state=0')
                return [ row[0] for row in self.cursor.fetchall() ]

        def updateURL(self, url, content, state=1):
                if url[-1]=='/': url=url[:-1]
                self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))

        def isCrawledURL(self, url):
                if url[-1]=='/': url=url[:-1]
                self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s' AND state=1"%url)
                ret = self.cursor.fetchone()
                return ret[0]

db = DB()

if __name__=='__main__':
        print 'starting crawl.py...'

        contents = getContent( mainpage )
        URLs = getArticleInfo( BeautifulSoup( contents ) )
        nSuccess = 0
        for u in URLs:
                nSuccess += db.insertURL( u )
        print 'inserted %d new pages.'%nSuccess

        while 1:
                uncrawled_urls = db.selectUncrawledURL()
                if not uncrawled_urls: break
                for u in uncrawled_urls: 

                        print 'downloading %s'%u
                        try:
                                parseArticle( u )
                        except:
                                traceback.print_exc()
                                db.updateURL( u, -1 )
                        db.commit()
                #bs.UpdateIndex()
I suspect you have opened the database in ascii mode or initialised it to run in ascii. — Steve Barnes
Never ever touch the Python default encoding. — Andreas Jung
Oh, my MySQL mode is utf8. I have changed it using default-character-set = utf8. — Moon Taejin
But did you recreate crawlDB after that change? DBs are sticky! — Steve Barnes
I recreated crawlDB as you suggested, but the same error appears. One thing is different, though: the 'ascii' error changed into 'latin-1'. What is the problem? I did change the database character set to utf8_unicode_ci. — Moon Taejin
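
The comment thread above turns on what character set the server and the urls table are actually using. One way to check that from the same kind of connection the crawler already has is sketched below (not part of crawl.py; the password placeholder is copied from the question):

# -*- coding: utf-8 -*-
# Sketch: inspect the character sets MySQL reports for this connection and for
# the existing urls table, using plain MySQLdb.
import MySQLdb

conn = MySQLdb.connect(db='crawlDB', user='root', passwd='......', charset='utf8')
cur = conn.cursor()

cur.execute("SHOW VARIABLES LIKE 'character_set%'")
for name, value in cur.fetchall():
    print name, value                      # e.g. character_set_connection utf8

cur.execute("SHOW CREATE TABLE urls")
print cur.fetchone()[1]                    # the CREATE TABLE output includes the table's charset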

3 Answers

2 votes

Specify the charset when connecting:

self.conn = MySQLdb.connect(db='crawlDB', user='root', passwd='......', charset='utf8')

Replace the following line:

self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))

with the following, separating the SQL from the parameters:

self.cursor.execute("UPDATE urls SET state=%s, content=%s WHERE url=%s", (state,content,url))

Example session:

>>> import MySQLdb
>>> db = MySQLdb.connect('localhost', db='test', charset='utf8')
>>> cursor = db.cursor()
>>> cursor.execute('DROP TABLE IF EXISTS urls')
0L
>>> cursor.execute('CREATE TABLE urls(url char(200), state int, content text)')
0L
>>> cursor.execute('INSERT INTO urls(url, state, content) VALUES(%s, %s, %s)', ('http://daum.net/', 1, u'\uc548\ub155'))
1L
>>> cursor.execute('SELECT * FROM urls')
1L
>>> for row in cursor.fetchall():
...     print row
...
(u'http://daum.net/', 1L, u'\uc548\ub155')
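
Applied to the question's DB class, the updateURL method would then look roughly like this (a sketch of the same change; the rest of the class stays as it is):

        def updateURL(self, url, content, state=1):
                if url[-1]=='/': url=url[:-1]
                # let the driver escape and encode each value instead of % string formatting
                self.cursor.execute("UPDATE urls SET state=%s, content=%s WHERE url=%s",
                                    (state, content, url))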

0 votes

Since you are generating the MySQL commands as strings, you need those strings to be unicode strings. Try changing all the cursor.execute(" lines to cursor.execute(u".
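
On the line from the traceback, that suggestion would look like this (a sketch of this answer only; it keeps the % formatting that the first answer replaces):

# u"" makes the SQL a unicode string, so mixing in the Korean content no longer
# forces an implicit ascii decode
self.cursor.execute(u"UPDATE urls SET state=%d,content='%s' WHERE url='%s'" % (state, content, url))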

0 votes

Try changing the environment variable PYTHONIOENCODING to "utf-8". If you don't want to export it, you can just do something like this:

PYTHONIOENCODING=utf-8 python myproject.py

Also, you have to use u"" strings.
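
A minimal way to see its effect (a sketch; check_encoding.py is a hypothetical file name, not part of the question's project):

# -*- coding: utf-8 -*-
# check_encoding.py (hypothetical), run as:  PYTHONIOENCODING=utf-8 python check_encoding.py
import sys

print sys.stdout.encoding          # reports utf-8 when the variable is set
print u'처리했다고 db에 표시합니다.'   # a u'' literal then prints without UnicodeEncodeError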