scrapy 请求头中携带 cookie

# -*- coding: utf-8 -*-

# Raw Cookie header value copied from a logged-in browser session.
cookie = "anonymid=jxcn09d5-vd52v0; depovince=GUZ; _r01_=1; ick_login=d41bb8a9-056b-41a7-b187-7c706f0f8702; ick=39b091b8-f882-499b-992b-34a682d3469a; JSESSIONID=abcHJrhG1CAIo64PJRrUw; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244004; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244011; wp_fold=0; td_cookie=18446744069457827825; jebecookies=e9de8580-fb15-4891-b7c3-7c08ebb41f5c|||||; _de=AE9934B6C85831351B86F7DDD5B20F8A; p=b25ab0edb69343d7f80c5e481864b8c30; first_login_flag=1; ln_uact=18620028487; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=b8a0d848e228dd51d2c84609f814495b0; societyguester=b8a0d848e228dd51d2c84609f814495b0; id=971298880; xnsid=6e5116da; ver=7.0; loginfrom=null"

# Extract the key/value pairs: the cookies sent with a request are given as a
# dict, so the raw cookie string has to be converted into one.
cookies = {}

for pair in cookie.split(';'):
    # Pairs are separated by "; ", so strip the space left behind by the
    # split; otherwise every key after the first starts with a blank.
    # Split on the first "=" only so values containing "=" stay intact.
    key, value = pair.strip().split("=", 1)
    cookies[key] = value

print(cookies)

print("*" * 100)

print()

# Same raw Cookie header string, converted this time with a dict comprehension.
cookies = "anonymid=jxcn09d5-vd52v0; depovince=GUZ; _r01_=1; ick_login=d41bb8a9-056b-41a7-b187-7c706f0f8702; ick=39b091b8-f882-499b-992b-34a682d3469a; JSESSIONID=abcHJrhG1CAIo64PJRrUw; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244004; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244011; wp_fold=0; td_cookie=18446744069457827825; jebecookies=e9de8580-fb15-4891-b7c3-7c08ebb41f5c|||||; _de=AE9934B6C85831351B86F7DDD5B20F8A; p=b25ab0edb69343d7f80c5e481864b8c30; first_login_flag=1; ln_uact=18620028487; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=b8a0d848e228dd51d2c84609f814495b0; societyguester=b8a0d848e228dd51d2c84609f814495b0; id=971298880; xnsid=6e5116da; ver=7.0; loginfrom=null"

# partition("=") cuts at the first "=" only, so a value that itself contains
# "=" is kept whole instead of being truncated at its second "=".
cookies = {
    name: value
    for name, _, value in (pair.partition("=") for pair in cookies.split("; "))
}

print(cookies)

要爬取的网页数据只有在登陆之后才能获取,所以我从浏览器中copy了登录后的cookie到scrapy项目settings文件的请求头中,但是程序执行完之后发现并没有获取到数据,控制台打印出来的debug信息提示需要登录,也就是说我在请求头中添加的cookie并没有效果。后来在网上查了资料,发现如果要携带cookie的话是需要设置的,如下所示:

在 settings 文件第 37 行左右,COOKIES_ENABLED 这一项默认是被注释掉的,我们只需要取消注释,并把 False 改为 True 就可以了,程序会默认使用 settings 文件请求头中的 cookie。

COOKIES_ENABLED= True       # uncommented and switched from False to True, as the accompanying text describes

我在网上看到一篇文章,说设置 cookie 一共有三种方式,上面说的只是其中一种,也是相对适合新手的一种。有兴趣的可以看看这篇文章,只是写得不是很详细,刚入门的话会有点难理解。

# -*- coding: utf-8 -*-

import scrapy, re

class LoginSpider(scrapy.Spider):
    """Spider that fetches renren.com profile pages using browser cookies.

    The cookies of an already logged-in browser session are attached to the
    first request; the follow-up request is issued without passing them again.
    """

    name = 'login'
    allowed_domains = ['renren.com']
    start_urls = ['http://www.renren.com/971298880/profile']

    def start_requests(self):
        """Yield the first request with the login cookies attached.

        Simulates a renren.com login by sending the cookies copied from the
        browser. Note that cookies must be enabled in settings.py.
        """
        # NOTE(review): session cookies are hard-coded here; they expire and
        # should not be committed to a shared repository.
        cookies = "anonymid=jxcn09d5-vd52v0; depovince=GUZ; _r01_=1; ick_login=d41bb8a9-056b-41a7-b187-7c706f0f8702; ick=39b091b8-f882-499b-992b-34a682d3469a; JSESSIONID=abcHJrhG1CAIo64PJRrUw; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244004; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244011; wp_fold=0; td_cookie=18446744069457827825; jebecookies=e9de8580-fb15-4891-b7c3-7c08ebb41f5c|||||; _de=AE9934B6C85831351B86F7DDD5B20F8A; p=b25ab0edb69343d7f80c5e481864b8c30; first_login_flag=1; ln_uact=18620028487; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=b8a0d848e228dd51d2c84609f814495b0; societyguester=b8a0d848e228dd51d2c84609f814495b0; id=971298880; xnsid=6e5116da; ver=7.0; loginfrom=null"
        # Split each pair on the first "=" only, so values that contain "="
        # are not truncated.
        cookies = {
            name: value
            for name, _, value in (pair.partition("=") for pair in cookies.split("; "))
        }

        yield scrapy.Request(self.start_urls[0], callback=self.parse, cookies=cookies)

    def parse(self, response):
        """Print matches of the user name on the profile page, then request the detail page."""
        print(re.findall("新用户28487", response.body.decode()))

        # The cookies do not have to be passed again here: they are enabled
        # in settings.py, so they only need to be supplied once and later
        # requests carry them automatically.
        yield scrapy.Request(
            'http://www.renren.com/971298880/profile?v=info_timeline',
            callback=self.parse_detail,
        )

    def parse_detail(self, response):
        """Print matches of the user name on the detail page."""
        print(re.findall("新用户28487", response.body.decode()))

推荐阅读更多精彩内容