# -*- coding: utf-8 -*-
import urllib, httplib, urlparse
import sys
import re

def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:  # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:  # no port specified, use default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:    # normal 'found' status
            found = True
        elif resp.status == 302:  # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:                     # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found
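# Usage sketch for httpExists() (the URL below is only an illustration, not part
# of the original script): a 200 answer to the HEAD request returns True; a 302
# is followed once via its Location header; anything else prints the status and
# returns False.
#
#     >>> httpExists('http://www.example.com/')
#     True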
"""Get the file name part of a URL."""
def gGetFileName(url):
    if url == None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

"""Download the file at url; the target file name is given as a parameter."""
def gDownloadWithFilename(url, savePath, file):
    # parameter checking is skipped for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        print 'download file url :', url
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError:
        print "download error!" + url

def gDownload(url, savePath):
    fileName = gGetFileName(url)
    gDownloadWithFilename(url, savePath, fileName)
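# Usage sketch for gDownload() (the URL below is illustrative, not from the
# original post): the file name is taken from the last path segment, so this
# call would save the image as d:/tmp/logo.png.
#
#     >>> gDownload('http://www.example.com/images/logo.png', 'd:/tmp/')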
def getRexgList(lines, regx, searchRegx):
    if lines == None:
        return
    lists = []
    for line in lines:
        ismatch = re.search(regx, line, re.IGNORECASE)
        if ismatch:
            matchs = re.search(searchRegx, line, re.IGNORECASE)
            if matchs != None:
                groups = matchs.groups()
                for str in groups:
                    if str not in lists:
                        lists.append(str)
    return lists
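# Worked example for getRexgList() (the HTML line is illustrative): for the line
#     <img src="images/logo.png" alt="logo">
# the filter regex r'src\s*="images(\S+)"' matches, the extraction regex
# r'src\s*="(\S+)"' captures 'images/logo.png', so the returned list is
# ['images/logo.png'].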
def checkLine(lines):
    for line in lines:
        # truncated in the original post: matchs = re.search(r'url ...
        pass

def getPageLines(url):
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except:
        print "getPageLines() error!"
        return
def getCurrentPageImage(url, savePath):
    lines = getPageLines(url)
    print 'lines.length', len(lines)
    regxlists = getRexgList(lines, r'src\s*="images(\S+)"', r'src\s*="(\S+)"')
    if regxlists == None:
        return
    print 'getCurrentPageImage() images.length', len(regxlists)
    for jpg in regxlists:
        jpg = url + jpg
        gDownload(jpg, savePath)
def getCSSImages(link, savePath, url):
    lines = getPageLines(link)
    print 'lines.length', len(lines)
    # The two regex arguments were cut off in the original post; a plausible
    # CSS url(...) pattern is assumed here.
    regxlists = getRexgList(lines, r'url\("?(\S+?)"?\)', r'url\("?(\S+?)"?\)')
    print 'getCSSImages() images.length', len(regxlists)
    for jpg in regxlists:
        jpg = url + jpg
        gDownload(jpg, savePath)
"""Collect the .htm/.html links found on the page at url and return them as a list."""
def gGetHtmlLink(url):
    # parameter checking is skipped for now
    rtnList = []
    lines = getPageLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in getRexgList(lines, regx, r'href="(\S+)"'):
        link = url + link
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList
"""Collect the .css links found on the page at url and return them as a list."""
def gGetCSSLink(url):
    # parameter checking is skipped for now
    rtnList = []
    lines = getPageLines(url)
    regx = r"""href="?(\S+)\.css"""
    for link in getRexgList(lines, regx, r'href="(\S+)"'):
        link = url + link
        if link not in rtnList:
            rtnList.append(link)
    return rtnList
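# Worked example for gGetCSSLink() (the markup line is illustrative): for a page
# containing
#     <link rel="stylesheet" href="templatemo_style.css" />
# the filter regex requires a ".css" suffix, the extraction regex r'href="(\S+)"'
# captures 'templatemo_style.css', and the function returns
# [url + 'templatemo_style.css'].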
"""读取其他的CSS,html文件中的图片
links=gGetHtmlLink(url)
for link in links:
print u'get images on link-html读取'
getCurrentPageImage(link,savePath)"""links=gGetCSSLink(url)for link inlinks:print 'get images on link:',link
getCSSImages(link,savePath,url)if __name__ == '__main__':
url= 'http://www.templatemo.com/templates/templatemo_281_chrome/'savePath= 'd:/tmp/'
print 'download pic from [' + url +']'
print 'save to [' +savePath+'] ...'getPageImage(url,savePath)print "download finished"
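The script above targets Python 2 (print statements, httplib, urlparse, urllib.URLopener). For readers on Python 3, the following is a minimal sketch of the same HEAD-check-then-download idea using only the standard library; the function names (http_exists, download) and the example URL are illustrative and not part of the original post.

# Minimal Python 3 sketch of the HEAD check + download steps above.
# Function names and the example URL are assumptions, not from the original script.
import os
import urllib.request
from http.client import HTTPConnection
from urllib.parse import urlsplit, urljoin

def http_exists(url):
    """Return True if a HEAD request for url answers 200 (follows a 302 once)."""
    parts = urlsplit(url)
    conn = HTTPConnection(parts.hostname, parts.port)
    conn.request("HEAD", parts.path or "/")
    resp = conn.getresponse()
    if resp.status == 200:
        return True
    if resp.status == 302:
        return http_exists(urljoin(url, resp.getheader("Location", "")))
    print("Status %d %s : %s" % (resp.status, resp.reason, url))
    return False

def download(url, save_path):
    """Save the resource at url into save_path, named after the last path segment."""
    name = url.rstrip("/").split("/")[-1]
    with urllib.request.urlopen(url) as fp:
        data = fp.read()
    with open(os.path.join(save_path, name), "wb") as out:
        out.write(data)

if __name__ == "__main__":
    page = "http://www.example.com/index.html"  # illustrative URL
    if http_exists(page):
        download(page, ".")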