使用RSelenium+rvest爬取动态或需登录页面教程

参考链接:https://blog.csdn.net/qq_33291559/article/details/80028119
https://www.jianshu.com/p/1fc6a6817160

基本思路:

RSelenium可以通过Selenium驱动浏览器访问URL,经浏览器渲染后得到html文件,因此可用于爬取动态页面;同时它也可以完成浏览器中的各种操作(如输入账号密码),并能维持登录状态,可用于爬取一些需要登录的网站的信息。
rvest用于解析上步获得的html信息,并使用CSS selector或xpath或正则表达式来截取所需信息,通过批量循环完成信息爬取。

基本路线:

RSelenium模拟浏览器工作:

1、页面初始化

# Create the remote driver object for a Chrome session on localhost:4445
remDr <- remoteDriver(
  browserName = "chrome",
  remoteServerAddr = "localhost",
  port = 4445L
)

2、浏览器开启:

# Open the browser session
remDr$open()
# Go to the target URL
remDr$navigate(url)

3、移动鼠标选中模块:

# Locate the target element by XPath
bn <- remDr$findElement(using = "xpath", value = input)
# Move the mouse pointer onto that element
remDr$mouseMoveToLocation(webElement = bn)

4、输入信息并鼠标或键盘操作:

# Text to type, followed by the Enter key
search <- list("淘宝", key = "enter")
# Clear whatever is already in the text box
bn$clearElement()
# Type the text into the box; the trailing Enter key submits it
bn$sendKeysToElement(search)
# Mouse alternative to the Enter key: click at the current pointer position
# remDr$click()

附:检查操作

1)截屏操作:

# Save a screenshot of the current page to 1.png.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
remDr$screenshot(display = FALSE, useViewer = TRUE, file = "1.png")

2) 查看当前页面信息:

# Write the rendered HTML of the current page to 2.html.
# writeLines() emits the text as-is (write.table() would add a header, row
# names and quoting); useBytes = TRUE avoids re-encoding to the native locale.
writeLines(remDr$getPageSource()[[1]], "2.html", useBytes = TRUE)

3)当前页面切换:

# Switch the driver's session to another browser window.
#
# remDr:    driver object exposing serverURL, sessionInfo and queryRD().
# windowId: handle of the window to switch to.
# Returns whatever remDr$queryRD() returns for the POST request.
myswitch <- function(remDr, windowId) {
  session_id <- remDr$sessionInfo[["id"]]
  endpoint <- sprintf("%s/session/%s/window", remDr$serverURL, session_id)
  payload <- list(handle = windowId)
  remDr$queryRD(endpoint, "POST", qdata = payload)
}
# Switch to the second open window, then show the URL it displays
handles <- remDr$getWindowHandles()
if (length(handles) < 2L) {
  stop(
    "Expected a second browser window, but found only ", length(handles),
    call. = FALSE
  )
}
myswitch(remDr, handles[[2]])
remDr$getCurrentUrl()

4)模拟页面滚动

# Scroll to the bottom repeatedly until the page height stops changing
last_height <- 0
repeat {
  remDr$executeScript(
    "window.scrollTo(0,document.body.scrollHeight);",
    list(remDr$findElement("css", "body"))
  )
  # Pause before measuring the height again
  Sys.sleep(10)
  new_height <- remDr$executeScript(
    "return document.body.scrollHeight",
    list(remDr$findElement("css", "body"))
  )
  # Same height as last time: stop scrolling
  if (unlist(last_height) == unlist(new_height)) {
    break
  }
  last_height <- new_height
}

rvest解析页面:

1、解析当前活动页面:

# Parse the HTML source of the page the driver currently points at
read_html(
  remDr$getPageSource()[[1]]
)

2、通过xpath截取信息:

# Select the nodes matching the price XPath from the parsed page
html_nodes(
  web,
  xpath = '//div[contains(@class,"price g_price g_price-highlight")]'
)

3、提取文本信息:

# Extract the text, with surrounding whitespace trimmed.
# TRUE is spelled out because T is an ordinary, reassignable variable.
html_text(web, trim = TRUE)

实例演示:

######################################################## 
#-------------------------------------------------------
# Topic:使用RSelenium(模拟浏览器)+rvest爬取淘宝信息(需要登录操作)
# Author:
# Date:Sun Mar 08 18:08:50 2020
# Mail:
#-------------------------------------------------------
########################################################

#在运行前,需要保证Selenium在后台开启
#java -jar xxxxxxselenium.jar
#-------------------------------------------------------
#Function1:进入淘宝界面
#-------------------------------------------------------


library(RSelenium)
# Connect to the Selenium server on localhost:4445 and open a Chrome session
remDr <- remoteDriver(
  browserName = "chrome",
  remoteServerAddr = "localhost",
  port = 4445L
)
remDr$open()

# Start from Baidu and take a screenshot to check the page
url <- "http://www.baidu.com"
remDr$navigate(url)
remDr$screenshot(display = FALSE, useViewer = TRUE, file = "1.png")

# Locate the search box and highlight it
input <- '//*[@id="kw"]'
bn <- remDr$findElement(using = "xpath", value = input)
bn$highlightElement()

# Dump the rendered HTML for inspection. getPageSource() returns a list, so
# take its first element; useBytes = TRUE avoids re-encoding to the native locale
writeLines(remDr$getPageSource()[[1]], "2.html", useBytes = TRUE)

# Search Baidu for Taobao
remDr$mouseMoveToLocation(webElement = bn)
search <- list("淘宝", key = "enter")
bn$sendKeysToElement(search)

# Click the link in the first search result
taobaourl <- '//*[@id="1"]/h3/a[1]'
taobao <- remDr$findElement(using = "xpath", taobaourl)
remDr$mouseMoveToLocation(webElement = taobao)
remDr$click()
#无法转化窗口,用自定义函数解决!
#remDr$switchToWindow(windowId = remDr$getWindowHandles()[[2]])
#remDr$getCurrentWindowHandle()
#转化窗口的函数
# Switch the driver's session to another browser window.
#
# remDr:    driver object exposing serverURL, sessionInfo and queryRD().
# windowId: handle of the window to switch to.
# Returns whatever remDr$queryRD() returns for the POST request.
myswitch <- function(remDr, windowId) {
  session_id <- remDr$sessionInfo[["id"]]
  endpoint <- sprintf("%s/session/%s/window", remDr$serverURL, session_id)
  payload <- list(handle = windowId)
  remDr$queryRD(endpoint, "POST", qdata = payload)
}
# Switch to the second open window, then show the URL it displays
handles <- remDr$getWindowHandles()
if (length(handles) < 2L) {
  stop(
    "Expected a second browser window, but found only ", length(handles),
    call. = FALSE
  )
}
myswitch(remDr, handles[[2]])
remDr$getCurrentUrl()
#-------------------------------------------------------
#Function2:完成淘宝的登录操作
#-------------------------------------------------------


# Locate the search box, then keep an HTML dump and a screenshot of the page
search <- '//*[@id="q"]'
search <- remDr$findElement(using = "xpath", search)
# getPageSource() returns a list, so take its first element; useBytes = TRUE
# avoids re-encoding the text to the native locale
writeLines(remDr$getPageSource()[[1]], "1.html", useBytes = TRUE)
remDr$screenshot(file = "1.png")

# Submit a keyword search
remDr$mouseMoveToLocation(webElement = search)
search_key <- list("眼睫毛", key = "enter")
search$sendKeysToElement(search_key)

# Not being logged in, the browser is redirected to the login page
remDr$getCurrentUrl()
login <- '//*[@id="J_Quick2Static"]'
login_ele <- remDr$findElement("xpath", login)
login_ele
remDr$mouseMoveToLocation(webElement = login_ele)
remDr$click()

# Fill in the account name and password fields
login_user <- '//*[@id="TPL_username_1"]'
login_passwd <- '//*[@id="TPL_password_1"]'
login_user_ele <- remDr$findElement("xpath", login_user)
login_passwd_ele <- remDr$findElement("xpath", login_passwd)
user <- list("xxxx")
pass <- list("xxxxx")
login_user_ele$sendKeysToElement(user)
login_passwd_ele$sendKeysToElement(pass)

# Note: in testing, Taobao appears to detect the automated login and asks for
# extra verification, so QR-code login is used instead.
# This also shows that remDr follows the active page: even after manual
# interaction in the browser, remDr matches the page that is finally shown.
code <- '//*[@id="J_OtherLogin"]/a[2]'
code <- remDr$findElement(using = "xpath", code)
remDr$mouseMoveToLocation(webElement = code)
remDr$click()
remDr$screenshot(file = "code.png")

# Wait for the user to confirm that the QR code has been scanned
panduan <- readline("是否已经完成扫码?T or F")
# readline() returns text, so the answer is compared as a string: "T" == TRUE
# is FALSE because TRUE is coerced to "TRUE"
if (toupper(trimws(panduan)) %in% c("T", "TRUE")) {
  print(remDr$getWindowHandles())
  print(remDr$getCurrentUrl())
}
#-------------------------------------------------------
#Function3:对跳转后的页面进行关键词检索
#-------------------------------------------------------


# Run the keyword search on the current page
key <- '//*[@id="q"]'
keyword <- list("眼睫毛", key = "enter")
search <- remDr$findElement(using = "xpath", value = key)
# Remove any text already in the search box before typing
search$clearElement()
search$sendKeysToElement(keyword)
#-------------------------------------------------------
#Function4:使用rvest解析网页信息并批量爬取
#-------------------------------------------------------


#使用rvest对当前页面爬取
library(rvest)
# Parse the rendered page and extract the text of every price node
web <- read_html(remDr$getPageSource()[[1]])
web <- html_nodes(
  web,
  xpath = '//div[contains(@class,"price g_price g_price-highlight")]'
)
# TRUE is spelled out because T is an ordinary, reassignable variable
price <- html_text(web, trim = TRUE)
price
# Crawl the first 100 result pages: RSelenium jumps to each page through the
# pager's input box and rvest parses the rendered HTML.
input <- '//*[@id="mainsrp-pager"]/div/div/div/div[2]/input'
# XPath of each text field to extract; the names become the column names
keylist <- c(
  price = '//div[contains(@class,"price g_price g_price-highlight")]',
  detail = '//a[@class="J_ClickStat"]',
  shop = '//span[@class="dsrs"]/following-sibling::span',
  location = '//div[@class="location"]'
)
img_dir <- '//img[@class="J_ItemPic img"]'
# One data frame per page, combined once after the loop
page_tables <- vector("list", 100L)
for (i in seq_along(page_tables)) {
  # Type the page number into the pager box and press Enter
  input1 <- remDr$findElement(using = "xpath", input)
  input_key <- list(as.character(i), key = "enter")
  input1$clearElement()
  input1$sendKeysToElement(input_key)
  # Give the requested page time to load before reading its source
  Sys.sleep(5)
  web <- read_html(remDr$getPageSource()[[1]])
  # lapply() keeps one character vector per selector, whatever its length
  fields <- lapply(keylist, function(x) {
    html_text(html_nodes(web, xpath = x), trim = TRUE)
  })
  img_dir_dir <- html_attr(html_nodes(web, xpath = img_dir), name = "data-src")
  fields <- c(list(img_dir = img_dir_dir), fields)
  # Selectors may match different numbers of nodes; pad with NA so the page
  # can still be stored, and warn because the rows may no longer line up
  n_rows <- max(lengths(fields))
  if (any(lengths(fields) != n_rows)) {
    warning(
      "Page ", i, ": selectors matched different numbers of nodes; ",
      "columns may be misaligned",
      call. = FALSE
    )
  }
  fields <- lapply(fields, function(x) {
    length(x) <- n_rows
    x
  })
  information <- as.data.frame(fields, stringsAsFactors = FALSE)
  page_tables[[i]] <- information
}
# Rows are stacked in page order
information_all <- do.call(rbind, page_tables)
# Save the collected table to an Excel workbook
library(xlsx)
write.xlsx(x = information_all, file = "眼睫毛.xlsx")

推荐阅读更多精彩内容