I'm writing a simple crawler in Python, with MySQL as the storage. When I run the program I get an error, and the content crawled from the web is never written to the MySQL table. The error is a ProgrammingError complaining about SQL syntax, but I don't think I typed the SQL wrong: the place the error points at is HTML from the crawled page, which is why HTML fragments show up in the error message. So I think something is going wrong between Python and MySQL. Here is the error message:

Traceback (most recent call last):
File "crawl.py", line 237, in
parseArticle( u )
File "crawl.py", line 166, in parseArticle
db.updateURL( url , contents )
File "crawl.py", line 206, in updateURL
self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))
File "/usr/lib/python2.7/dist-packages/MySQLdb/cursors.py", line 174, in execute
self.errorhandler(self, exc, value)
File "/usr/lib/python2.7/dist-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
ProgrammingError: (1064, 'You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near \'\xeb\x8f\x8b\xec\x9b\x80\', dotum, sans-serif; }\r\n\t//-->\r\n\t\n
Here is the relevant code (crawl.py):

import MySQLdb
import traceback
from BeautifulSoup import BeautifulSoup

class DB:
    def __init__(self):
        self.conn = MySQLdb.connect(db='crawlDB', user='root', passwd='qltkd')
        self.conn.query("set character_set_connection=utf8;")
        self.conn.query("set character_set_server=utf8;")
        self.conn.query("set character_set_client=utf8;")
        self.conn.query("set character_set_results=utf8;")
        self.conn.query("set character_set_database=utf8;")
        self.cursor = self.conn.cursor()
        self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url CHAR(150), state INT, content TEXT)')

    def commit(self):
        self.conn.commit()

    def __del__(self):
        self.conn.commit()
        self.cursor.close()

    def insertURL(self, url, state=0, content=None):
        # strip a trailing '/'
        if url[-1] == '/': url = url[:-1]
        try:
            self.cursor.execute("INSERT INTO urls VALUES ('%s',%d,'%s')" % (url, state, content))
        except:
            return 0
        else:
            return 1

    def selectUncrawledURL(self):
        self.cursor.execute("SELECT * FROM urls where state=0")
        return [row[0] for row in self.cursor.fetchall()]

    def updateURL(self, url, content, state=1):
        if url[-1] == '/': url = url[:-1]
        self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'" % (state, content, url))

    def isCrawledURL(self, url):
        if url[-1] == '/': url = url[:-1]
        self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s' AND state=1" % url)
        ret = self.cursor.fetchone()
        return ret[0]

db = DB()

if __name__ == '__main__':
    print 'starting crawl.py...'
    contents = getContent(mainpage)
    URLs = getArticleInfo(BeautifulSoup(contents))
    nSuccess = 0
    for u in URLs:
        nSuccess += db.insertURL(u)
    print 'inserted %d new pages.' % nSuccess
    while 1:
        uncrawled_urls = db.selectUncrawledURL()
        if not uncrawled_urls: break
        for u in uncrawled_urls:
            print 'downloading %s' % u
            try:
                parseArticle(u)
            except:
                traceback.print_exc()
                db.updateURL(u, -1)
            db.commit()
        #bs.UpdateIndex()
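For context on why this fails: the UPDATE statement is built with Python % string interpolation, so the first quote character inside the crawled HTML (the CSS fragment quoted in the error message contains one) terminates the SQL string literal early, and MySQL then sees invalid syntax. Below is a minimal sketch of the usual remedy, passing the values as a parameter tuple so MySQLdb quotes and escapes them itself; update_url here is a hypothetical standalone version of DB.updateURL, not the original code.

import MySQLdb

def update_url(cursor, url, content, state=1):
    # strip a trailing '/', as in the original code
    if url.endswith('/'):
        url = url[:-1]
    # Placeholders plus a parameter tuple instead of % interpolation:
    # MySQLdb escapes quotes, backslashes and newlines inside content
    # itself, so crawled HTML can no longer break the statement. Note
    # that MySQLdb's placeholder is %s for every column type, even INT.
    cursor.execute("UPDATE urls SET state=%s, content=%s WHERE url=%s",
                   (state, content, url))

The INSERT in insertURL has the same problem and takes the same form of fix. Separately, the five set character_set_* queries can usually be replaced by passing charset='utf8' (and use_unicode=True) to MySQLdb.connect, which keeps the connection and the stored data in agreement about encoding.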