2019-07-30

#! /usr/bin/env python
# # -*- coding: utf-8 -*-

import datetime
import gzip
import hashlib
import io
import json
import logging
import os
import random
import re
import socket
import ssl
import StringIO
import sys
import threading
import time

from lxml import etree
import threadpool


# Default text encoding constant; not referenced in this chunk — presumably
# used by code elsewhere (TODO confirm).
defencode = 'utf-8'
# Disable TLS certificate verification for every https request this process
# makes, so unverifiable certs don't abort the crawl.
# NOTE(review): this is a global security downgrade.
ssl._create_default_https_context = ssl._create_unverified_context

# The urllib APIs were reorganized between Python 2 and 3; alias the right
# module as `urllib_` so the rest of the script is version-agnostic.
if sys.version_info[0] == 2:
    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
    import urllib2 as urllib_
elif sys.version_info[0] == 3:
    from http.server import BaseHTTPRequestHandler, HTTPServer
    import urllib.request as urllib_


# Append INFO-and-above records to serv.log, tagging each record with the
# source path and line number.
logging.basicConfig(level=logging.INFO, filename='serv.log', filemode='a',
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')


def FLog(msg):
    """Record *msg* in the log file and echo it to stdout with a timestamp."""
    logging.info(msg)
    stamp = datetime.datetime.now().strftime('%c')
    print('{0}\t{1}'.format(stamp, msg))


class NoRedirection(urllib_.HTTPRedirectHandler):
    """Redirect handler that does NOT follow 301/302/303.

    Bug fix: the original methods were missing ``self``, so when urllib
    invoked them as bound methods every argument shifted one slot and the
    name ``code`` actually held ``fp`` (the response object). The corrected
    signatures make the real behavior explicit: hand back the un-redirected
    response itself, so callers still get an object with ``.code``/``.read``.
    """

    def http_error_301(self, req, fp, code, msg, headers):
        # Return the 3xx response as-is instead of following the redirect.
        return fp

    def http_error_302(self, req, fp, code, msg, headers):
        return fp

    def http_error_303(self, req, fp, code, msg, headers):
        return fp


def httpRequest(url, headers=None, postdata=None, proxy=None):
    """Fetch *url* and return ``(status_code, body_bytes)``.

    headers: optional dict of request headers (defaults to a minimal UA).
    postdata: optional request body; when not None the request is a POST.
    proxy: optional 'host:port' string applied to both http and https.

    Redirects are not followed (NoRedirection handler). Gzip-encoded
    bodies are decompressed transparently. Any failure is swallowed and
    reported as the sentinel ``(600, None)`` — this is a deliberate
    best-effort contract relied on by callers.
    """
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0'
        }
    try:
        if proxy is None:
            opener = urllib_.build_opener(NoRedirection)
        else:
            opener = urllib_.build_opener(NoRedirection, urllib_.ProxyHandler({
                "http": proxy,
                "https": proxy,
            }))
        resp = opener.open(urllib_.Request(url, headers=headers, data=postdata))
        data = resp.read()
        # io.BytesIO works on both Python 2 and 3; the original
        # StringIO.StringIO breaks on py3 where the body is bytes.
        if resp.headers.get('content-encoding') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(data))
            try:
                data = gz.read()
            finally:
                gz.close()  # release even if decompression fails mid-read
        return resp.code, data
    except Exception:
        return 600, None


def downFile(url, path, headers=None, postdata=None, proxy=None):
    """Fetch *url* via httpRequest and save the body to *path*.

    Nothing is written unless the server answered 200 with a body.
    """
    status, body = httpRequest(url=url, headers=headers, postdata=postdata, proxy=proxy)
    if status != 200 or body is None:
        return
    with open(path, 'wb') as out:
        out.write(body)


proxy = None  # e.g. '127.0.0.1:1080' — switch to a proxy when the crawler's IP gets banned

if __name__ == '__main__':
    # Crawl qiushibaike article ids sequentially and mirror each article
    # (raw JSON plus its media file, if any) under ./qiushi/<id>/.
    rootdir = './qiushi'
    for i in range(110006543, 130000000):
        itemdir = rootdir + '/%d' % i  # one directory per article id
        print('handle %d' % i)
        if os.path.exists(itemdir):
            # already fetched on a previous run — skip, makes the crawl resumable
            continue
        url = 'https://m2.qiushibaike.com/article/%d' % i
        code, data = httpRequest(url=url, proxy=proxy, headers={
            'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/5.0'
        })
        if code != 200:
            continue
        try:
            # Probe the payload: must be JSON containing /article/content,
            # otherwise treat this id as invalid and move on.
            jdata = json.loads(data)
            jdata['article']['content']
        except Exception as e:
            continue
        # Create the directory only once the article is known to be valid.
        os.makedirs(itemdir)
        contentfile = itemdir + '/data'
        with open(contentfile, 'wb') as f:
            f.write(data)  # raw JSON response stored verbatim
        print('handle %d done' % i)
        if 'high_url' not in jdata['article']:
            continue
        # high_url: presumably a media (video) URL — TODO confirm; saved
        # next to the JSON under its original basename.
        high_url = jdata['article']['high_url']
        highurlfile = itemdir + '/' + os.path.basename(high_url)
        downFile(high_url, highurlfile)


#! /usr/bin/env python
# # -*- coding: utf-8 -*-

import datetime
import gzip
import hashlib
import json
import logging
from lxml import etree
import os
import random
import re
import socket
import ssl
import StringIO
import sys
import threading
import threadpool
import time


# Default text encoding constant; not referenced in this chunk — presumably
# used by code elsewhere (TODO confirm).
defencode = 'utf-8'

# HTTPS relies on SSL certificates and Python sometimes cannot validate
# them; disabling verification globally lets every https fetch proceed
# anyway. NOTE(review): this is a global security downgrade.
ssl._create_default_https_context = ssl._create_unverified_context

# The urllib library methods differ between Python 2 and Python 3; alias
# the right module as `urllib_` so the rest of the script is version-agnostic.
if sys.version_info[0] == 2:
    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
    import urllib2 as urllib_
elif sys.version_info[0] == 3:
    from http.server import BaseHTTPRequestHandler, HTTPServer
    import urllib.request as urllib_

# Append INFO-and-above records to serv.log, tagging each record with the
# source path and line number.
logging.basicConfig(level=logging.INFO, filename='serv.log', filemode='a',
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')


def FLog(msg):
    """Record *msg* in the log file and echo it to stdout with a timestamp."""
    logging.info(msg)
    stamp = datetime.datetime.now().strftime('%c')
    print('{0}\t{1}'.format(stamp, msg))

# Handler used to disable automatic redirect following.
class NoRedirection(urllib_.HTTPRedirectHandler):
    """Redirect handler that does NOT follow 301/302/303.

    Bug fix: the original methods were missing ``self``, so when urllib
    invoked them as bound methods every argument shifted one slot and the
    name ``code`` actually held ``fp`` (the response object). The corrected
    signatures make the real behavior explicit: hand back the un-redirected
    response itself, so callers still get an object with ``.code``/``.read``.
    """

    def http_error_301(self, req, fp, code, msg, headers):
        # Return the 3xx response as-is instead of following the redirect.
        return fp

    def http_error_302(self, req, fp, code, msg, headers):
        return fp

    def http_error_303(self, req, fp, code, msg, headers):
        return fp

'''
HTTP/HTTPS request anatomy (author's notes):

Request side — three ways to pass parameters:
1. Query string: http://www.baidu.com/omn/20190810/20190810A0ND3I00.html?a=1&b=2&aaa=
2. Header, e.g. User:lichao
3. Request body

GET /omn/20190810/20190810A0ND3I00.html?usr=lichao&pass=lihao HTTP/1.1(\r\n)
User-Agent: ...(\r\n)
Cookie:"asl=1,sa=1"
(\r\n\r\n)

POST /omn/20190810/20190810A0ND3I00.html HTTP/1.1(\r\n)
User-Agent: ...(\r\n)
(\r\n\r\n)
body.....

Response side:
HTTP/1.1 200 OK(\r\n)
Header1:value1(\r\n)
Header2:value2
Cookie:"JSESSIONID=aaghlajalggajlsjkdklflkjas"
...
(\r\n\r\n)
body.....
'''

def httpRequest(url, headers=None, postdata=None, proxy=None):
    """Fetch *url* and return ``(status_code, body_bytes)``.

    headers: optional dict of request headers (defaults to a minimal UA).
    postdata: optional request body; when not None the request is a POST.
    proxy: optional 'host:port' string applied to both http and https.

    Redirects are not followed (NoRedirection handler); use a plain
    ``build_opener()`` if automatic redirects are wanted. Gzip-encoded
    bodies are decompressed transparently. Any failure is swallowed and
    reported as the sentinel ``(600, None)`` — a deliberate best-effort
    contract relied on by callers.
    """
    if headers is None:
        headers = {
            'User-Agent': 'Mozilla/5.0'
        }
    try:
        if proxy is None:
            opener = urllib_.build_opener(NoRedirection)
        else:
            opener = urllib_.build_opener(NoRedirection, urllib_.ProxyHandler({
                "http": proxy,
                "https": proxy,
            }))
        # Build the request, then perform it and read status + body.
        resp = opener.open(urllib_.Request(url, headers=headers, data=postdata))
        data = resp.read()
        if resp.headers.get('content-encoding') == 'gzip':
            # The response header says the body is gzip-compressed, so
            # decompress it. io.BytesIO works on both Python 2 and 3; the
            # original StringIO.StringIO breaks on py3 where data is bytes.
            gz = gzip.GzipFile(fileobj=io.BytesIO(data))
            try:
                data = gz.read()
            finally:
                gz.close()  # release even if decompression fails mid-read
        return resp.code, data
    except Exception:
        return 600, None


def downFile(url, path, headers=None, postdata=None, proxy=None):
    """Fetch *url* via httpRequest and save the body to *path*.

    Nothing is written unless the server answered 200 with a body.
    """
    status, body = httpRequest(url=url, headers=headers, postdata=postdata, proxy=proxy)
    if status != 200 or body is None:
        return
    with open(path, 'wb') as out:
        out.write(body)

proxy = None  # e.g. '127.0.0.1:1080' — switch to a proxy when the crawler's IP gets banned

if __name__ == '__main__':
    # Crawl qiushibaike article ids sequentially and mirror each article
    # (raw JSON plus its media file, if any) under ./qiushi/<id>/.
    rootdir = './qiushi'
    for i in range(110006540, 130000000):
        itemdir = rootdir + '/%d' % i  # one directory per article id
        print('handle %d' % i)
        if os.path.exists(itemdir):
            # already fetched on a previous run — skip, makes the crawl resumable
            continue
        url = 'https://m2.qiushibaike.com/article/%d' % i
        code, data = httpRequest(url=url, proxy=proxy, headers={
            'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'Mozilla/5.0'
        })
        if code != 200:
            continue
        try:
            jdata = json.loads(data)
            jdata['article']['content'] # probe the json's /article/content value; raises if absent
        except Exception as e:
            continue
        # Create the directory only once the article is known to be valid.
        os.makedirs(itemdir)
        contentfile = itemdir + '/data'
        with open(contentfile, 'wb') as f:
            f.write(data)  # raw JSON response stored verbatim
        print('handle %d done' % i)
        if 'high_url' not in jdata['article']:
            continue
        # high_url: presumably a media (video) URL — TODO confirm; saved
        # next to the JSON under its original basename.
        high_url = jdata['article']['high_url']
        highurlfile = itemdir + '/' + os.path.basename(high_url)
        downFile(high_url, highurlfile)
        # Example paths: itemdir = ./qiushi/110006543 (created by os.makedirs)
        # highurlfile = ./qiushi/110006543/1.img

推荐阅读更多精彩内容

  • 字典和集合 一.字典 什么是字典?(dict) 1)字典字典是容器型数据类型(序列),将{}作为容器的标志,里面多...
    风中逐月fzzy阅读 12评论 0 0
  • 百合晨曦阅读 19评论 0 4
  • 姓名:吴赛玉 公司:北京奕腾时尚科技有限公司 组别:第489期六项精进 利他一组 【日精进打卡第152天】 一、【...
    13811490967阅读 7评论 0 0
  • 2019-07-30 哈尔滨第380期利他二组简书作者 姓名:周翔 扬州市方圆建筑工程有限公司 【日精进打卡第41...
    香蕉香蕉_2917阅读 8评论 0 0
  • 原创:飞儿 最近有不少朋友在讨论关于买保险的问题,现在买保险的渠道越来越多了,除了找保险公司的代理人咨询购买,还可...
    爱自由的飞儿阅读 125评论 0 1