新浪云上的定时鼠绘漫画爬虫

我放在新浪云上的小爬虫，每天定时访问鼠绘漫画，如果有更新，就下载对应的更新并保存起来。下面是详细的代码说明，因为是自己的总结，所以说明不一定会面面俱到。

实现鼠绘爬虫：

鼠绘的网站：http://www.ishuhui.com/ 鼠绘是个良心漫画网站，但因为各种原因，鼠绘上的漫画无法保留很长时间，为了满足我自己的需求，写了这个小爬虫。

#coding=utf-8
import sys
import time
import os
import json
import requests
import math
from sys import getsizeof
# Python 2 only: reload(sys) restores sys.setdefaultencoding (removed at
# interpreter start-up) so the implicit str/unicode codec can be set to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
# Timestamp remembered by wait_url(); 0 means "not set yet".
gloab_time = 0
# Running total of downloaded image data, accumulated by load_picture().
global_data = 0
# Set to True by load_picture() once global_data exceeds 12 MB; home() then
# stops starting new downloads for the rest of the run.
isbreak = False
def home():
    """Check the ishuhui home feed and download every chapter not yet saved.

    Reads the "latest cartoons" JSON feed, skips chapters whose id is already
    listed in download_id_list.txt, downloads the others through
    load_picture() and finally rewrites download_id_list.txt with the
    chapters handled in this run.

    Once load_picture() has set the module-level ``isbreak`` flag, no further
    download is started: only chapters that were already recorded earlier are
    kept in the list, so the remaining new ones are picked up by a later run.
    """
    url_02 = "http://api.ishuhui.com/index/ish/ver"
    post_url_prefix = "http://hhzapi.ishuhui.com/cartoon/post/ver/c1579509/id/"
    shuhui_load_dict = {}
    ir = requests.get(url_02)
    download_id_list = getdown_id()
    book_json = json.loads(ir.content)

    time_over = time.strftime('%y-%m', time.localtime(time.time()))
    print(time_over)

    # Only the first 10 feed entries are considered; slicing (instead of
    # indexing 0..9) avoids an IndexError when the feed holds fewer entries.
    for chapter in book_json['data']['cartoonLatest'][:10]:
        chapater_id = chapter['id']
        chapater_book = chapter['book']
        chapater_number = chapter['number']
        chapater_title = chapter['title']
        chapater_name = chapter['name']
        temp_array = [chapater_name, chapater_title, chapater_id]
        already_downloaded = str(chapater_id) in download_id_list

        if isbreak:
            # Download budget used up: keep the record of chapters saved
            # earlier, but leave new chapters unrecorded so they are retried.
            if already_downloaded:
                shuhui_load_dict[chapater_id] = temp_array
            continue

        shuhui_load_dict[chapater_id] = temp_array
        if already_downloaded:
            continue

        print("新的漫画".encode('GBK') + str(chapater_id))
        url_04 = post_url_prefix + str(chapater_id) + '.json'
        ir = requests.get(url_04)
        decodejson = json.loads(ir.content)
        picture_nus = decodejson['data']['content_img']
        if not json.loads(picture_nus):
            # Fallback: the id from the feed yielded no images, so look the
            # chapter up by book/number and use the post id found there.
            numstr = str(chapater_book) + "-0-" + "n-" + str(chapater_number)
            url_load = "http://api.ishuhui.com/cartoon/post/ver/f6887413/num/" + numstr + '.json'
            ir = requests.get(url_load)
            decodejson = json.loads(ir.content)
            down_id = decodejson['data']['posts'][0]['url'].replace("http://hanhuazu.cc/cartoon/post?id=", "")
            url_04 = post_url_prefix + str(down_id) + '.json'
            print(url_04)
            ir = requests.get(url_04)
            decodejson = json.loads(ir.content)

        load_picture(decodejson)

    # The context manager closes the record file even if the write fails.
    with open("download_id_list.txt", 'w') as id_file:
        id_file.write(dict_to_text(shuhui_load_dict))

    
def dict_to_text(data_dict):
    """Render the download record as tab-separated text, one entry per line.

    Every value of ``data_dict`` must be a sequence with at least three
    items; the first three are converted with str() and joined by tabs.
    Returns an empty string for an empty mapping.
    """
    rows = []
    for entry in data_dict.values():
        columns = [str(entry[0]), str(entry[1]), str(entry[2])]
        rows.append('\t'.join(columns) + '\n')
    return "".join(rows)
        
    
def getdown_id():
    """Return the chapter ids recorded in download_id_list.txt.

    Each line is expected to hold three tab-separated fields; the third one
    (the chapter id) is collected as a string.  Lines with fewer than three
    fields, such as blank lines, are skipped instead of raising IndexError.
    If the file does not exist yet it is created empty and an empty list is
    returned.
    """
    path_file = "download_id_list.txt"
    if not os.path.exists(path_file):
        # First run: create the empty record file.
        with open(path_file, 'w'):
            pass
        return []

    download_id_list = []
    # The context manager guarantees the handle is closed after reading.
    with open(path_file, 'r') as id_file:
        for line in id_file:
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                download_id_list.append(fields[2])
    return download_id_list
    
        
def loading_test(nums):
    """Download the single chapter whose post id is ``nums`` (manual helper)."""
    url_04 = "http://hhzapi.ishuhui.com/cartoon/post/ver/c1579509/id/" + str(nums) + '.json'
    # Fetch the chapter JSON before decoding it; without this request ``ir``
    # is undefined and the function fails with NameError.
    ir = requests.get(url_04)
    decodejson = json.loads(ir.content)
    load_picture(decodejson)

    
def load_picture(load_decodejson):
    """Download every page image of one chapter and store it twice on disk.

    ``load_decodejson`` is the decoded chapter JSON.  The function reads
    ``data.content_img`` (a JSON-encoded mapping whose values are image
    paths), ``data.book_text`` and ``data.title`` from it.  Images are
    fetched in sorted key order and written as ``<title><index>.jpg`` into
    ``<book_text>/<title>/`` and into ``<yy-mm>/<book_text>/<title>/``.

    The payload size of each image is added to the module-level
    ``global_data`` counter; when the counter exceeds 12 MB after the chapter
    is finished, the module-level ``isbreak`` flag is set to True.
    """
    global global_data
    global isbreak
    picture_json = load_decodejson['data']['content_img']
    load_book_text = load_decodejson['data']['book_text']
    load_title = load_decodejson['data']['title']
    # os.path.join uses the separator of the host OS, so the same code works
    # on Windows and on non-Windows hosts (a hard-coded "\\" does not).
    serchfile = os.path.join(load_book_text, load_title)
    print("now loading:" + serchfile)
    picture_dict = json.loads(picture_json)
    sort_key = sorted(picture_dict)
    mkdir(load_book_text)
    mkdir(serchfile)

    time_over = time.strftime('%y-%m', time.localtime(time.time()))
    monthfile = os.path.join(time_over, serchfile)
    mkdir(time_over)
    mkdir(monthfile)
    bar = ProgressBar(total=len(sort_key))

    for num, key in enumerate(sort_key):
        load_url = "http://pic01.ishuhui.com" + picture_dict[key].replace('/upload', "")
        ir = requests.get(load_url, timeout=10)
        image_data = ir.content
        # len() is the number of payload bytes; sys.getsizeof would also
        # count the interpreter's per-object overhead.
        global_data = global_data + len(image_data)
        file_name = '%s%s.jpg' % (load_title, num)
        for folder in (serchfile, monthfile):
            # The context manager closes each image file right after writing.
            with open(os.path.join(folder, file_name), 'wb') as image_file:
                image_file.write(image_data)
        bar.move()
        bar.log()
    print(load_title + "下载完成")

    if global_data > 1024 * 1024 * 12:
        print("data > 12M")
        isbreak = True

# Throttle: keep successive calls at least two seconds apart.
def wait_url(nowtime):
    """Sleep so that consecutive calls are spaced at least 2 seconds apart.

    ``nowtime`` is the caller's current timestamp in seconds.  The first
    call (module-level ``gloab_time`` still 0) only records the timestamp.
    Later calls sleep for the part of the 2-second interval that has not
    elapsed yet, then record the time at which the caller may proceed, so
    every call is measured against the previous one.
    """
    global gloab_time
    min_interval = 2
    if gloab_time == 0:
        # Nothing to compare against yet.
        gloab_time = nowtime
        return
    elapsed = nowtime - gloab_time
    # Wait only for the remainder of the interval; a clock that moved
    # backwards (negative elapsed) is treated as "no wait needed".
    delay = min_interval - elapsed if 0 <= elapsed < min_interval else 0
    if delay > 0:
        time.sleep(delay)
    gloab_time = nowtime + delay
            
            
def mkdir(path):
    """Create directory ``path`` (including missing parents) if needed.

    Prints ``path`` first.  Returns True when this call created the
    directory and False when the path already existed.  Any other creation
    failure is re-raised as OSError.
    """
    print(path)
    try:
        os.makedirs(path)
    except OSError:
        # Attempting the creation and inspecting the failure avoids the race
        # between a separate existence check and the creation itself.
        if not os.path.exists(path):
            raise
        return False
    return True

# User-Agent string of the UC mobile browser, sent with every session request.
UA = (
    'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-CN; '
    'HTC Velocity 4G X710s Build/IML74K) AppleWebKit/534.30 '
    '(KHTML, like Gecko) Version/4.0 UCBrowser/10.1.3.546 '
    'U3/0.8.0 Mobile Safari/534.30'
)  # UC UA
# Shared HTTP session used by getHtml().
requestSession = requests.session()
requestSession.headers.update({'User-Agent': UA})



def getHtml():
    """Fetch one fixed page through the shared session and save it to myhtml.html."""
    cid_page = requestSession.get('http://www.37zw.net/1/1257/619223.html').text
    # The context manager closes the output file even if the write fails.
    with open("myhtml.html", 'w') as html_file:
        html_file.write(cid_page)
    
def getSizeInNiceString(sizeInBytes):
    """Format a byte count as a short human-readable string.

    Sizes of at least 1 KB are shown with one decimal in the largest fitting
    unit among GB, MB and KB (1024-based).  Smaller sizes are shown in bytes,
    with a trailing ".0" dropped and the singular "1 byte" for exactly 1.
    """
    units = (("GB", 1024 ** 3), ("MB", 1024 ** 2), ("KB", 1024))
    for label, cutoff in units:
        if sizeInBytes >= cutoff:
            return "%.1f %s" % (sizeInBytes * 1.0 / cutoff, label)
    if sizeInBytes == 1:
        return "1 byte"
    text = "%.1f" % (sizeInBytes or 0,)
    if text.endswith('.0'):
        text = text[:-2]
    return text + ' bytes'
    
def ByteFormat(size, unit='Bytes'):
    """Express ``size`` (in bytes) in ``unit`` with two decimals, e.g. "1.50 KB".

    ``unit`` must be one of Bytes, KB, MB, GB, TB or PB (1024-based);
    any other value raises ValueError.
    """
    units = ['Bytes', 'KB', 'MB', 'GB', 'TB', 'PB']
    template = '%.2f' + " " + unit
    # The position of the unit in the list is the power of 1024 to divide by.
    divisor = math.pow(1024, units.index(unit))
    return template % (size / divisor)
    
    
class ProgressBar:
    """Minimal single-line text progress bar written to stdout.

    ``count`` is the number of finished steps, ``total`` the number of
    expected steps and ``width`` the bar width in characters.
    """

    def __init__(self, count = 0, total = 0, width = 50):
        self.count = count
        self.total = total
        self.width = width

    def move(self):
        """Advance the bar by one finished step."""
        self.count += 1

    def log(self):
        """Redraw the bar on the current line; end the line once it is full."""
        # Blank the line first (bar width plus the 9-character "nnn/nnn: "
        # prefix), then return the cursor to the start of the line.
        sys.stdout.write(' ' * (self.width + 9) + '\r')
        sys.stdout.flush()
        if self.total > 0:
            # Floor division keeps the value an int on every Python version.
            progress = self.width * self.count // self.total
        else:
            # No expected steps (the default): draw an empty bar instead of
            # dividing by zero.
            progress = 0
        sys.stdout.write('{0:3}/{1:3}: '.format(self.count, self.total))
        sys.stdout.write('#' * progress + '-' * (self.width - progress) + '\r')
        if progress == self.width:
            sys.stdout.write('\n')
        sys.stdout.flush()
 

# Script entry point: run one crawl pass when the file is executed directly.
if __name__ == '__main__':
    home()

爬虫第一步自然是抓包，然后分析。直接使用requests进行请求无法获得内容，对于这类网站来说太正常不过，也许是页面需要调用js，也许是用了ajax，目前我没能找到让requests突破js和ajax加载的办法，233。
期间走了不少弯路，甚至试过selenium和phantomjs这个万能选项，可以实现，但速度太慢。后来在页面中发现了两类json文件，http://api.ishuhui.com/index/ish/ver 和http://hhzapi.ishuhui.com/cartoon/post/ver/c1579509/id/ .鼠绘漫画的结构是不定期的更新新出的漫画，放在主页进行展示，最多展示10个不同的漫画，第一个请求返回的json文件就包含主页面这10个最新漫画的信息，第二个链接后面的数字信息，就是从第一个json请求中获取的。
第二个就是页面的信息，但是呢实际过程中发现这样还是有些问题的，因为第一个json文件中的内容不一定完全正确，具体实现机制不明，但是偶尔会有一些漫画的编号错误。这样就要使用第二类json文件
```python
numstr = str(chapater_book) + "-0-" + "n-" + str(chapater_number)
url_load = "http://api.ishuhui.com/cartoon/post/ver/f6887413/num/" + numstr + '.json'
```
代码是这样，直接获取该漫画的内容。

然后就是代码放到新浪云上要做的修改。新浪云一天最低的花费是0.1元，提供20M的外网访问流量。因为网站更新不会那么频繁，没有必要一次完成全部的下载，所以我写了个流量计算的方法：如果下载的数据多于12M，就结束今天的下载，第二天继续。
github的地址https://github.com/zengaorong/spider
vendor是新浪云用来放外部插件的文件夹，打包好的插件放里面就行，但是与路径有关的大部分插件都会有问题，也就是只有部分插件是能正常使用的。

from sae.storage import Bucket 这个是使用新浪云的storage要用到的。
bucket.put_object(storagefile,file_data) 向目标路径写数据
listarray = bucket.get_object_contents('download_id_list.txt') 从目标路径读取数据

基本这样功能就实现好了，定时功能是新浪云实现的，设置每天的6点调用该服务一次，就实现了每天定时爬取鼠绘漫画的功能。

最后编辑于：2017.12.11 06:43:48

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 159,716评论 4赞 364
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 67,558评论 1赞 294
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 109,431评论 0赞 244
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 44,127评论 0赞 209
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 52,511评论 3赞 287
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 40,692评论 1赞 222
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 31,915评论 2赞 313
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 30,664评论 0赞 202
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 34,412评论 1赞 246
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 30,616评论 2赞 245
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 32,105评论 1赞 260
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 28,424评论 2赞 254
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 33,098评论 3赞 238
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 26,096评论 0赞 8
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 26,869评论 0赞 197
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 35,748评论 2赞 276
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 35,641评论 2赞 271

新浪云上的定时鼠绘漫画爬虫

推荐阅读更多精彩内容