bilibili弹幕爬虫 (python)
一个简单的弹幕爬虫,大概说下思路和爬取过程,完整代码放最下面
b站更新了,普通的抓包不好抓了,但办法还是有的:
1、找到cid
打开视频页面 https://www.bilibili.com/video/av21483276 ,查看网页源代码并搜索 "cid=",可以找到形如 cid=数字& 的字段(原文此处为截图):
2、提取cid
用正则提取出来即可:
import re
cid = re.findall(r"cid=([\d]+)&",res)[0]
3、通过cid获取包含弹幕的响应的url地址
把cid填入 https://comment.bilibili.com/{cid}.xml 即可,例如cid为35358033时,用字符串拼接即可得到:https://comment.bilibili.com/35358033.xml
4、最后用xpath提取弹幕
item["弹幕"] = html.xpath("//d/text()")
5、保存
完整代码:
import requests
from lxml import etree
import re
from pprint import pprint
import json
class Bili(object):
    """Download the danmaku (bullet comments) of one bilibili video as JSON.

    :meth:`run` drives the whole flow: fetch the video page, extract the
    title and ``cid`` from it, build the comment-XML URL from the ``cid``,
    fetch that XML, collect the text of every ``<d>`` element and write the
    result to ``<name>.json``.
    """

    def __init__(self, url_num, timeout=10):
        """Remember which video to crawl.

        Args:
            url_num: The av number of the video.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.name = "av号" + str(url_num)
        self.url = "https://www.bilibili.com/video/av{}".format(url_num)
        self.timeout = timeout
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
        }

    def parse(self, url):
        """GET *url* with the crawler's headers and return the body as text.

        The raw body is decoded with the default codec of ``bytes.decode()``
        (UTF-8). The HTTP status is deliberately not checked: an error page
        simply fails the title lookup in :meth:`get_content`.
        """
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        return response.content.decode()

    def get_content(self, res):
        """Extract the video title and cid from the video page.

        Args:
            res: HTML text of the video page.

        Returns:
            ``{"title": ..., "cid": ...}``, or ``None`` when the page has no
            non-empty title or no ``cid=<digits>&`` occurrence.
        """
        html = etree.HTML(res)
        titles = html.xpath("//div[@id='viewbox_report']/h1/@title")
        title = titles[0] if titles else None
        if not title:
            return None

        cids = re.findall(r"cid=([\d]+)&", res)
        if not cids:
            # A titled page without a cid is treated like any other page
            # we cannot use, instead of raising IndexError.
            return None

        item = {"title": title, "cid": cids[0]}
        print(item)
        return item

    def get_url(self, item):
        """Build (and print) the comment-XML URL for ``item["cid"]``."""
        danmu_url = "https://comment.bilibili.com/{}.xml".format(item["cid"])
        print(danmu_url)
        return danmu_url

    def get_danmu(self, res, item):
        """Store the text of every ``<d>`` element of *res* in ``item["弹幕"]``.

        Args:
            res: Text of the comment XML.
            item: Dict to extend; it is modified in place.

        Returns:
            The same *item* dict, now holding the list of comment texts.
        """
        # NOTE(review): bytes rather than str are handed to lxml, presumably
        # because the XML carries an encoding declaration, which lxml rejects
        # in str input -- confirm before changing.
        html = etree.HTML(res.encode())
        item["弹幕"] = html.xpath("//d/text()")
        return item

    def save(self, name, content):
        """Write *content* as pretty-printed UTF-8 JSON to ``<name>.json``.

        The file is overwritten on every call: appending a second JSON
        document to an existing one would leave a file no JSON parser can
        read.
        """
        with open("{}.json".format(name), "w", encoding="utf-8") as f:
            f.write(json.dumps(content, ensure_ascii=False, indent=4))
        print("保存成功")

    def run(self):
        """Run the crawl end to end; prints a message and stops early when
        the video page cannot be parsed."""
        # 1-2. Request the video page.
        res = self.parse(self.url)

        # 3. Extract the title and cid.
        item = self.get_content(res)
        if item is None:
            print("vid号不正确,请重新输入。")
            return

        # 4. Build the danmaku URL.
        danmu_url = self.get_url(item)

        # 5. Request the danmaku XML.
        res_danmu = self.parse(danmu_url)

        # 6. Extract the comments.
        end = self.get_danmu(res_danmu, item)

        # 7. Save them.
        self.save(self.name, end)
        print("程序结束")
if __name__ == '__main__':
    # Demo entry point: the av number is fixed here; it could instead be
    # read interactively with input().
    url_num = 21483276
    print("url_num=", url_num)
    Bili(url_num).run()
原文链接:https://blog.csdn.net/qq_22043649/article/details/80985903
https://blog.csdn.net/im4566/article/details/86310601
https://blog.csdn.net/a360316515/article/details/83058481
https://blog.csdn.net/weixin_41185456/article/details/79601563
https://blog.csdn.net/weixin_43994086/article/details/84863237