Python学习日记3| 用python多进程爬取58同城北京地区10w+数据

今天是4.13号。

昨天把会议论文算是完成任务地写完然后提交了,而实习还没有找上,所以最近一段时间应该都会整天在实验室学习python吧,加上最近一个多星期全部都是大雨哪也去不了(说的好像不下雨就会出去转悠一样)。本来还想问一下宋教授现在有什么项目可以跟过去做,但又怕把python的学习落下,所以还是最近半个月先把这个课程全部学完吧。另外电脑运行pycharm真心带不动,所以也在等家里的那台笔记本寄过来,同时不得不提的是也在等投稿的论文消息,wish there is a good result。


照样在贴上代码之前,总结在实际中新学的知识与所遇到的问题。
(1).快捷键ctrl+/可以多行注释,全部选定后tab可以多行缩进,shift+tab则可以向左缩进。
(2).注意select('')和split('')得到的结果都是列表,所以都要在后面加下标[number]。
(3).X.stripped_strings 用于去除字符串X中包含的空格或空行。同时注意要用list()把那一串数据括起来。
(4).对于多种分类情况时,最好用if语句来进行判断。判断某特定字符串s1是否包含在另一字符串s2中,可用 if 's1' in 's2'。

(5).要关注抓取的数据是网页自带的,还是通过request返回的json数据,一般json都是字典数据。对于浏览量等JS数据,首先在审查元素的network-JS中找到相关网页,然后进行解析。
解析过程包括:将查询网页的id导出,然后用format()直接替换到相应的JS动态网页构造成新的网页;接着跟一般网页解析一样用requests.get()去请求;最后由于JS网页的回应内容都是字符串,所以直接用js.text然后再用相应的split或其他方法截取自己想要的内容。
还一个问题要注意,对于请求JS数据时,记得加上headers包括: 'Referer'和 'User-Agent'


第一段

__author__ = 'guohuaiqi'
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
from bs4 import BeautifulSoup
import requests
import string

url='http://bj.58.com/sale.shtml'
host='http://bj.58.com'

# Collect the links of every goods category on the sale index page.
def get_cate_link(url):
    """Scrape the 58.com sale index and return all category URLs.

    Args:
        url: the sale index page, e.g. 'http://bj.58.com/sale.shtml'.

    Returns:
        list[str]: absolute category URLs (module-level ``host`` + href).

    Bug fix: the original loop computed each link and then dropped it
    (only a commented-out print remained), so callers could never obtain
    the result; the links are now collected and returned.
    """
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    anchors = soup.select('#ymenu-side > ul > li > ul > li > b > a')
    return [host + item.get('href') for item in anchors]

# get_cate_link(url)

# Hand-curated whitelist of 58.com Beijing second-hand category URLs,
# one URL per line inside a triple-quoted string. Consumers split it
# into individual links via cate_list.split(). Kept as a literal (rather
# than calling get_cate_link at import time) so the crawler does not
# depend on a network request just to start.
cate_list="""
    http://bj.58.com/shouji/
    http://bj.58.com/tongxunyw/
    http://bj.58.com/danche/
    http://bj.58.com/fzixingche/
    http://bj.58.com/diandongche/
    http://bj.58.com/sanlunche/
    http://bj.58.com/peijianzhuangbei/
    http://bj.58.com/diannao/
    http://bj.58.com/bijiben/
    http://bj.58.com/pbdn/
    http://bj.58.com/diannaopeijian/
    http://bj.58.com/zhoubianshebei/
    http://bj.58.com/shuma/
    http://bj.58.com/shumaxiangji/
    http://bj.58.com/mpsanmpsi/
    http://bj.58.com/youxiji/
    http://bj.58.com/jiadian/
    http://bj.58.com/dianshiji/
    http://bj.58.com/ershoukongtiao/
    http://bj.58.com/xiyiji/
    http://bj.58.com/bingxiang/
    http://bj.58.com/binggui/
    http://bj.58.com/chuang/
    http://bj.58.com/ershoujiaju/
    http://bj.58.com/bangongshebei/
    http://bj.58.com/diannaohaocai/
    http://bj.58.com/bangongjiaju/
    http://bj.58.com/ershoushebei/
    http://bj.58.com/yingyou/
    http://bj.58.com/yingeryongpin/
    http://bj.58.com/muyingweiyang/
    http://bj.58.com/muyingtongchuang/
    http://bj.58.com/yunfuyongpin/
    http://bj.58.com/fushi/
    http://bj.58.com/nanzhuang/
    http://bj.58.com/fsxiemao/
    http://bj.58.com/xiangbao/
    http://bj.58.com/meirong/
    http://bj.58.com/yishu/
    http://bj.58.com/shufahuihua/
    http://bj.58.com/zhubaoshipin/
    http://bj.58.com/yuqi/
    http://bj.58.com/tushu/
    http://bj.58.com/tushubook/
    http://bj.58.com/wenti/
    http://bj.58.com/yundongfushi/
    http://bj.58.com/jianshenqixie/
    http://bj.58.com/huju/
    http://bj.58.com/qiulei/
    http://bj.58.com/yueqi/
    http://bj.58.com/tiaozao/
"""

第二段

__author__ = 'guohuaiqi'
# !/usr/bin/env python
# _*_ coding: utf-8 _*_
from bs4 import BeautifulSoup
import requests
import time
import pymongo
import sys

client=pymongo.MongoClient('localhost',27017)
tongcheng=client['tongcheng']
urllist=tongcheng['urllist']
content=tongcheng['content']


# Crawl one listing page of a category and persist every on-site item link.
def get_content_links(cate_url, page):
    """Fetch listing page *page* of *cate_url*, save item links, scrape items.

    The bare category URL only shows the first page, so the paged URL is
    built as e.g. http://bj.58.com/danche/pn2/ .

    Args:
        cate_url: category root URL ending in '/', taken from cate_list.
        page: 1-based listing page number.
    """
    page_list = '{}pn{}/'.format(cate_url, str(page))
    web_data = requests.get(page_list)
    soup = BeautifulSoup(web_data.text, 'lxml')
    time.sleep(1)  # throttle between page requests to stay polite
    # A page without <td class="t"> cells has no listings (past the last
    # real page) — nothing to do.
    if not soup.find('td', 't'):
        return
    for anchor in soup.select('td.t a.t'):
        content_link = anchor.get('href').split('?')[0]
        # Keep only genuine 58.com listings; links redirecting to
        # zhuanzhuan / promoted goods are dropped.
        if 'bj.58.com' in content_link:
            urllist.insert_one({'url': content_link})
            get_item_content(content_link)

# cate_url='http://bj.58.com/youxiji/'
# get_content_links(cate_url,20)

# Scrape the detail page of one item: category, title, date, price, district.
def get_item_content(content_link):
    """Parse one 58.com item page and insert its fields into MongoDB.

    Off-site (zhuanzhuan / promoted) links are filtered by the caller;
    404 placeholder pages are detected and skipped here.

    Args:
        content_link: absolute item URL on bj.58.com.
    """
    # Common prefix of the attribute-row selectors; formatted with the
    # <li> index (2 or 3) and the trailing class suffix ('tit'/'con').
    sel_base = ('#content > div.person_add_top.no_ident_top > '
                'div.per_ad_left > div.col_sub.sumary > ul > '
                'li:nth-of-type({}) > div.su_{}')

    def _district_from(index):
        # The district <li> sometimes wraps the text in <span>, sometimes
        # not. Bug fix: the original probed with soup.find_all() on a CSS
        # selector string, but find_all matches tag *names*, so it never
        # matched and the span branch was unreachable; use select() instead.
        span_hits = soup.select(sel_base.format(index, 'con') + ' > span')
        if span_hits:
            return list(span_hits[0].stripped_strings)
        return list(soup.select(sel_base.format(index, 'con'))[0].stripped_strings)

    try:
        web_data1 = requests.get(content_link)
        soup = BeautifulSoup(web_data1.text, 'lxml')
        # 404 placeholder pages load a script whose src path contains '404'.
        script = soup.find('script', type='text/javascript')
        if '404' in script.get('src').split('/'):
            return
        # The '区域' (district) row is either the 2nd or the 3rd attribute
        # <li>, depending on the category.
        if '区域' in soup.select(sel_base.format(2, 'tit'))[0].get_text():
            district = _district_from(2)
        elif '区域' in soup.select(sel_base.format(3, 'tit'))[0].get_text():
            district = _district_from(3)
        else:
            district = None
        price_text = soup.select('span.price.c_f50')[0].text
        data = {
            'goods_cate': soup.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')[0].text.strip(),
            'title': soup.select('#content h1')[0].text.strip(),
            # Dates come as e.g. '04.13'; normalise dots to dashes.
            'date': soup.select('#content li.time')[0].text.replace('.', '-'),
            # '面议' (price negotiable) items are stored with price=None.
            'price': price_text.replace('元', '').strip() if '面议' not in price_text else None,
            'district': district,
        }
        content.insert_one(data)
    except requests.ConnectionError as e:
        print(e.response)
#
# b=['http://bj.58.com/shuma/23190415633187x.shtml','http://bj.58.com/yishu/25471342844357x.shtml','http://bj.58.com/shouji/25683386143296x.shtml','http://bj.58.com/shuma/23425779899550x.shtml']
# get_item_content(b)
# get_content_links('http://bj.58.com/shouji/',20)

第三段

# _*_ coding: utf-8 _*_
#!/usr/bin/env python
__author__ = 'guohuaiqi'
from multiprocessing import Pool
from get_cate_link import cate_list
from get_all_contents import get_content_links,urllist,content

# Resume support: after a crash, replace cate_list.split() in pool.map()
# below with rest_list to re-crawl only the links whose detail records
# were never stored.
db_urllist = [item['url'] for item in urllist.find()]
# Bug fix: the original called content.fina(), which raises
# AttributeError — pymongo collections expose find(), not fina().
content_urllist = [item['url'] for item in content.find()]
x = set(db_urllist)
y = set(content_urllist)
rest_list = x - y


def get_all_links(cate_url):
    """Crawl one category 100 listing pages deep."""
    for page in range(1, 101):
        get_content_links(cate_url, page)


if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_links, cate_list.split())

第四段
最后再加上一个count函数来对数据库中的item计数

__author__ = 'guohuaiqi'
# !/usr/bin/env python
# _*_ coding: utf-8 _*_
import time
from get_all_contents1 import content

# Simple progress monitor: print the total number of scraped items
# every 3 seconds while the crawler runs in another process.
while True:
    # NOTE(review): Cursor.count() is deprecated and removed in pymongo
    # 4.x; content.count_documents({}) is the modern equivalent — confirm
    # the installed pymongo version before changing it.
    print(content.find().count())
    time.sleep(3)

再要注意的就是,一定一定在写代码前在最前面加上:
#!/usr/bin/env python
# _*_ coding: utf-8 _*_

在爬取了10745条数据后自己手动停止了程序,一共花了差不多12分钟。

推荐阅读更多精彩内容