python爬取百度关键字和联想

python爬取百度输入联想


1.打开百度,右击审查元素

1.png

这里我使用的是chrome浏览器

2.获取信息

找到相应的数据

'GET':url,
'HOST':'sp0.baidu.com',
'Referer':'https://www.baidu.com/?tn=91694651_hao_pg',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'

3.利用正则爬取

key_word = re.findall("\"(.*?)\"", res)

    num = 0
    for i in key_word:
        num += 1
        if i == "s":
            for item in key_word[num:]:
                print item

这里为什么需要用 i == 's',可以自己 print i 试试

为什么不用soup.find_all,主要是这个网页的代码是字典形式

23.png

4.下面是完整的代码

#coding: utf-8
import requests
import urllib
import re
import time
from bs4 import BeautifulSoup


list_keyword = ['华为Mate10', '小米Mx2']
for keyword in list_keyword:
    gjc = urllib.quote(keyword)#中文转换成url编码
    url = 'https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd='+gjc+'&json=1&p=3&sid=&csor=2&pwd= &cb=jQuery110207361392755424963_1505220177752&_=1505220177757'
    proxies = {"http":'http://119.5.0.53', "http":'http://140.250.170.110', "http":'http://221.229.46.81'}#使用3个代理ip地址,防止多次爬取屏蔽
    headers = {'GET':url,
                'HOST':'sp0.baidu.com',
                'Referer':'https://www.baidu.com/?tn=91694651_hao_pg',
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'
                }
    html = requests.get(url, headers=headers, proxies=proxies).content#取得html文本,并且使用代理ip
    soup = BeautifulSoup(html, 'html.parser')
    res = soup.get_text()#将html变成文本格式
    print res
    key_word = re.findall("\"(.*?)\"", res)#正则获得字符串里面的字符

    num = 0
    for i in key_word:
        num += 1
        if i == "s":#通过排查得到's'字符后面的字符为关键字
            for item in key_word[num:]:
                print item
    time.sleep(3)#每次爬取要3秒以后

爬取结果:

FTR4EEGL3PRW3N3$66MIORC.png

爬取百度关键字

方法和上面步骤相同,但是这次我在headers里面加入了cookie

代码如下:

#coding: utf-8
import requests
import urllib
from bs4 import BeautifulSoup

list_keyword = ['苹果8', '小米Mx2']
for item in list_keyword:
    gjc = urllib.quote(item)
    url = 'https://www.baidu.com/s?ie=utf-8&mod=1&isid=C72D6237C6C55642&ie=utf-8&f=8&rsv_bp=0&rsv_idx=1&tn=baidu&wd=' + gjc + '&rsv_pq=816a886000062c06&rsv_t=c589dMGIygeVyzfmHXpCrPrWN2S4yWp8ttSNQ77uzbcec5H1cOVF2yiedcs&rqlang=cn&rsv_enter=1&rsv_sug3=7&rsv_sug1=3&rsv_sug7=100&rsv_sug2=0&inputT=89&rsv_sug4=41058&rsv_sid=1424_21119_17001_22072&_ss=1&clist=&hsug=&f4s=1&csor=2&_cr1=28095'
    header = {  'Refer':'https://www.baidu.com/',
                'Host':'www.baidu.com',
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36',
                'Cookie':'BAIDUID=75774ED511BF2DECB6C4A142D1F4CCDF:FG=1; PSTM=1499515257; BIDUPSID=DD4A04E2ED6C099A9F40EF86229306AB; FP_UID=616a141a155bdb648b37b61ff16cfdc3; BDRCVFR[yfg8b4Gp7xm]=yiTPYW-i3eTXZFWmv68mvqV; H_PS_645EC=9607I22jkjSFIiy14y6B2VlNt4TQHDqMpVs%2FNy6mdneG1NMGExHCLS0uRrs; BD_CK_SAM=1; PSINO=5; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BD_HOME=0; H_PS_PSSID=1447_19033_13550_21095_18560_17001_22160; BD_UPN=12314753'
            }

    content = requests.get(url, headers=header).content
    soup = BeautifulSoup(content, 'html.parser')
    keyword = soup.find_all('div', id="rs")
    for i in keyword:
        print i.get_text()

爬取结果:


1.png

推荐阅读更多精彩内容