# 主要步骤:
# 1、分析蘑菇租房的请求url,观察到总共是28页数据,数据以JSON格式返回,还比较简单,不涉及到解析html
# 2、通过规律,构建不同到请求参数,循环通过requests调用url进行爬取数据
# 3、把爬来的数据存储到本地Excel中
#coding:utf-8
import requests
import json
import xlsxwriter
import time
import random
# 构建不同的参数
def url_data():
    """Build the POST payloads for all 28 result pages of the listing API.

    Returns:
        list[dict]: one request-parameter dict per page, each with keys
        'currentPage' (1..28), 'cityId' (289, the target city) and
        'showCount' (18 results per page).
    """
    print("开始进行爬虫程序")
    data_list = []
    # Only 'currentPage' varies between pages; the other two fields are fixed.
    for page in range(1, 29):
        data_list.append({
            'currentPage': page,
            'cityId': 289,
            'showCount': 18,
        })
        print("成功构建第" + str(page) + "条请求参数")
    print("成功构建所有请求参数,一共构建了" + str(len(data_list)) + '条请求参数')
    return data_list
# 获取房子数据
def data_parser(dataList, root_url, headers):
    """POST each payload in ``dataList`` to the listing API and collect rooms.

    Args:
        dataList: list of request-parameter dicts (see ``url_data()``).
        root_url: the listing API endpoint URL.
        headers: HTTP headers to send with every request.

    Returns:
        list[dict]: one dict per room, restricted to the fields below.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    # Fields copied verbatim from each JSON room record.
    fields = ('title', 'subTitle', 'showPrice', 'location', 'lat', 'lng',
              'detailDesc', 'cityId', 'communityId')
    room_list = []
    for data in dataList:
        # Random 0-8 s pause between requests to avoid hammering the server.
        # (The original slept only once, before the whole loop.)
        time.sleep(random.random() * 8)
        response = requests.post(root_url, headers=headers, data=data)
        response.raise_for_status()  # fail fast instead of KeyError on an error page
        res_list = response.json()['content']['list']
        for res_info in res_list:
            room_list.append({key: res_info[key] for key in fields})
            print('成功把第' + str(len(room_list)) + '条房子信息存储到列表')
    print("房子信息获取成功,一共获取到" + str(len(room_list)) + "条数据")
    return room_list
# 把数据存储到表格
def sort_excel(roomList, filename='上海租房数据.xlsx'):
    """Write the scraped room records to an Excel workbook.

    Args:
        roomList: list of room dicts produced by ``data_parser()``.
        filename: output workbook path; the default keeps the original
            hard-coded name for backward compatibility.
    """
    print("开始存储数据到Excel")
    # Column order defines both the header row and the per-room cell order.
    columns = ('title', 'subTitle', 'showPrice', 'location', 'lat', 'lng',
               'detailDesc', 'cityId', 'communityId')
    book = xlsxwriter.Workbook(filename)   # create the workbook
    sheet = book.add_worksheet()           # add a single sheet
    # Header row (row 0).
    for col, name in enumerate(columns):
        sheet.write(0, col, name)
    # Data rows start at row 1, one row per room.
    for row, room_info in enumerate(roomList, start=1):
        for col, name in enumerate(columns):
            sheet.write(row, col, room_info[name])
        print("成功存储第" + str(row) + "条房子数据到excel中")
    print("存储完毕,一共存储了" + str(len(roomList)) + "条房子数据")
    book.close()
    print("爬虫程序结束")
if __name__ == '__main__':
    # Entry point: build the page payloads, fetch every room record,
    # then export everything to the Excel workbook.
    root_url = 'https://api.mgzf.com/room-find-web/find/list'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'origin': 'http://www.mgzf.com',
        'referer': "http://www.mgzf.com/list/pg28/",
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    sort_excel(data_parser(url_data(), root_url, headers))