# -*- coding: utf-8 -*-
# 很喜欢李狗嗨、四重奏、海女,就做了一个爬取豆瓣评分 9.0 以上日剧的爬虫;很小的程序,仅供娱乐。
import re
import urllib2
class DouBanSpider() :
def __init__(self) : #这里是初始化参数,两个英文下划线
self.number=1
self.page = 1
self.cur_url = "https://www.douban.com/doulist/3116346/?start={page}&sort=seq&sub_type="
self.title = []
self.score= []
self.info=[]
self._top_num = 1
self. filePath ="D:\demo\douban.txt"
print "豆瓣爬虫准备就绪, 准备爬取数据..."
def get_page(self, cur_page) :
url = self.cur_url
try :
my_page = urllib2.urlopen(url.format(page = (cur_page - 1) * 25)).read()
except urllib2.URLError, e :
if hasattr(e, "code"):
print "The server couldn't fulfill the request."
print "Error code: %s" % e.code
elif hasattr(e, "reason"):
print "We failed to reach a server. Please check your url and read the Reason"
print "Reason: %s" % e.reason
return my_page
def find_title(self, my_page) :#爬日剧名字
pattern=re. compile(ur'<a href="https://movie.douban.com/subject/\d+/" target="_blank">\s*([\s\S]*?)\s', re.S)
movie_items=pattern.findall(my_page)
for movie in movie_items:
if movie!='<img':
self.title. append(movie)
def find_score(self, my_page):#爬日剧评分
pattern = re.compile(ur'</span>(\d.\d)', re.S)
movie_items = pattern.findall(my_page)
for score in movie_items:
self.score.append(score)
def find_info(self,my_page):#爬日剧信息
pattern = re.compile(ur'<div class="abstract">\s*(.*?)</div>',re.S)
movie_items = pattern.findall(my_page)
for info in movie_items:
self.info.append(info.replace('<br />',''))
def write(self):#保存到文本文件
f = open(self.filePath, 'w')
for movie,score,info in zip(self.title,self.score,self.info):
f.write('Top' +str(self.number)+':'+ movie+ '\n')
f.write('评分:'+score+'\n')
f.write(info+'\n'+'\n')
self.number+=1
print"文件写入成功"
f.close()
def start_spider(self) :
"""
爬虫入口, 并控制爬虫抓取页面的范围
"""
while self.page <=3:
my_page = self.get_page(self.page)
self.find_title(my_page)
self.find_score(my_page)
self.find_info(my_page)
self.page += 1
if __name__=="__main__": #这里是主程序入口,两个英文下划线
myspider = DouBanSpider()
myspider.start_spider()
myspider.write()
print "豆瓣爬虫爬取结束..."
# 结果显示:运行结束后,抓取结果写入 D:\demo\douban.txt。