RCurl包爬取豆瓣电影id和IMDB电影号id

爬取豆瓣id和IMDB_id


#输入电影名字、导演、演员信息，爬取豆瓣id和IMDB_id
#输入信息必须经过严格清洗，不允许出现空格；导演和演员字段的字符长度不得超过6


#### Measure the script's running time
# Capture the start time and print it in one step; `timestart` is read again
# at the end of the script to compute the elapsed time.
print(timestart <- Sys.time())
####这块写你要运行的程序


#报头设置非常重要，爬虫一定要伪装，另外for循环一定要间隔休息
library(xlsx)
library(readxl)
library(plyr)
library(sqldf)
library(data.table)
library(RCurl)
library(XML)
library(stringr)
# Browser-like HTTP request headers. The scraping loop passes this named
# character vector as `httpheader` to every getURL() call so the requests
# look like they come from a regular browser.
myheader <- setNames(
  c(
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "en-us",
    "keep-alive",
    "GB2312,utf-8;q=0.7,*;q=0.7"
  ),
  c("User-Agent", "Accept", "Accept-Language", "Connection", "Accept-Charset")
)


######## Edit-distance based string similarity ########
# Similarity of two strings derived from their Levenshtein (edit) distance.
#
# Arguments:
#   x, y  Strings to compare. Only the first element of each is used; factors
#         and other atomic values are converted with as.character().
#
# Returns:
#   A single number rounded to two decimals:
#     1 - edit_distance(x, y) / (nchar(x) + nchar(y))
#   Identical strings score 1, and so do two empty strings. A missing (NA) or
#   zero-length argument scores 0, so an unscrapable field never counts as a
#   match.
Fun <- function(x, y) {
  x <- as.character(x)[1]
  y <- as.character(y)[1]
  if (is.na(x) || is.na(y)) {
    return(0)
  }

  # Split into single characters once and take both lengths from the split
  # result, so the matrix dimensions and the character vectors always agree.
  a <- strsplit(x, "")[[1]]
  b <- strsplit(y, "")[[1]]
  m <- length(a)
  n <- length(b)

  # Two empty strings are identical; return early to avoid dividing by zero.
  if (m + n == 0) {
    return(1)
  }

  # M[i + 1, j + 1] holds the edit distance between the first i characters of
  # x and the first j characters of y. The first column/row are the distances
  # to the empty prefix.
  M <- matrix(0, nrow = m + 1, ncol = n + 1)
  M[, 1] <- 0:m
  M[1, ] <- 0:n

  # seq_len() yields an empty loop when one string is empty, whereas
  # 2:(m + 1) would count backwards and index past the matrix.
  for (i in seq_len(m)) {
    for (j in seq_len(n)) {
      cost <- if (a[i] == b[j]) 0 else 1
      M[i + 1, j + 1] <- min(
        M[i, j + 1] + 1,  # deletion
        M[i + 1, j] + 1,  # insertion
        M[i, j] + cost    # substitution (free when the characters match)
      )
    }
  }

  # Turn the distance into a similarity score.
  round(1 - M[m + 1, n + 1] / (m + n), 2)
}





#url <- "https://movie.douban.com/"

#text=c("碟中谍","狮子王","魔戒3","星际穿越","火星救援","碟中谍2","职业特工队2","谍影重重2","碟中谍5")
#text="哈利波特与魔法石"
#text="少年派的奇幻漂流"
#text="哈利波特与死亡圣器(下)"
#text="手机"
#text <- t(c("加勒比海盗1：黑珍珠号的诅咒","冯小刚"," 张国立葛优范"))
#text <- t(c("哈利波特与死亡圣器(下)" ,        "大飞",   "廖智苗皓钧"))
#输入参数
#text <- y[1:100,c(1,2,3)]
#i=2
#抽样测试
#text <- text[sample(2901,200,replace = F),]

##################测试###########################
#text <- as.data.frame(t(z[1,]),stringsAsFactors = F)

###################匹配分类##############
#A <- "完全匹配"
#B <- "多个匹配但前五结果唯一"
#C <- "返回一个结果标题不匹配但详情页匹配"
#D <- "前五结果多个但是匹配上了"
#E <- "完全不匹配"
#F <- "前五结果多个但是没有匹配上"


# Build one Douban search URL per input title (first column of `text`).
# The title is converted to UTF-8 and percent-encoded so that non-ASCII
# characters and reserved characters such as "&", "#", "(" or a space cannot
# corrupt the query string.
url <- paste0(
  "https://movie.douban.com/subject_search?search_text=",
  curlEscape(enc2utf8(as.character(text[, 1]))),
  "&cat=1002"
)

# Result containers filled by the scraping loop, one slot per title.
# Pre-filling with NA keeps all three vectors the same length as `url` even if
# an iteration leaves a slot unassigned.
url_douban <- NULL
url_douban_id <- rep(NA_character_, length(url))
imdb_id <- rep(NA_character_, length(url))
class <- rep(NA_character_, length(url))

# The titles are later pasted into a regular expression, so literal
# parentheses must be escaped as \( and \). str_replace_all() escapes every
# occurrence, not only the first one in each title.
# This is done after the URLs are built so the search uses the raw title.
text[, 1] <- str_replace_all(text[, 1], pattern = "\\(", replacement = "\\\\(")
text[, 1] <- str_replace_all(text[, 1], pattern = "\\)", replacement = "\\\\)")

#i=1
# ---- Helpers used by the scraping loop ----

# XPath of the last external link (target=_blank, rel=nofollow) in the detail
# page's info box; the script takes the text of that link as the IMDb id.
imdb_link_xpath <-
  "//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]"

# Download `page_url` with the disguised request headers and parse the
# response as UTF-8 HTML.
fetch_page <- function(page_url) {
  html <- getURL(page_url, .encoding = "utf-8", followlocation = TRUE,
                 httpheader = myheader)
  htmlParse(html, asText = TRUE, encoding = "UTF-8")
}

# IMDb id text found on a parsed detail page, or "000" when the page has no
# such link. Only the first hit is kept, so the result is always length one.
extract_imdb_id <- function(detail_doc) {
  found <- xpathSApply(detail_doc, path = imdb_link_xpath, xmlValue)
  if (length(found) == 0) {
    return("000")
  }
  found[[1]]
}

# Douban id = fifth "/"-separated piece of a subject link.
extract_douban_id <- function(subject_url) {
  str_split(subject_url, pattern = "/")[[1]][5]
}

# Normalise a scraped credit string (director / cast): take the first value,
# drop the `label` prefix, spaces and "/" separators, keep at most six
# characters. `fallback` is returned when nothing usable was scraped.
clean_credit <- function(raw, label, fallback) {
  first <- if (length(raw) > 0) raw[[1]] else NA
  if (length(first) != 1 || is.na(first)) {
    return(fallback)
  }
  cleaned <- str_replace_all(first, pattern = label, "")
  cleaned <- str_replace_all(cleaned, pattern = " ", "")
  cleaned <- str_replace_all(cleaned, pattern = "/", "")
  substring(cleaned, 1, 6)
}

# ---- Main loop: one search per input title ----
# Match classes written to `class`:
#   A  exactly one search result title matches
#   B  several titles match, but only one of them is within the first five
#   C  the single search result does not match by title, but its detail page
#      lists the input title as an alternative name
#   D  several matches; the best director/cast similarity exceeds 0.5
#   E  nothing matches
#   F  several matches; no director/cast similarity exceeds 0.5
for (i in seq_along(url)) {
  doc <- fetch_page(url[i])

  # Titles of all search results (text[i, 1] is the movie title to look for).
  title <- xpathSApply(doc, "//div[@class='pl2']//a", xmlValue)
  title <- str_replace_all(title, pattern = "·", replacement = "")

  pipei <- str_detect(
    title,
    paste0(" ", text[i, 1], " {0,2}(\\(.{2,3}\\))?(：.{0,8})?", "[\n | /]")
  )
  hits <- which(pipei)
  hits_in_top5 <- length(which(head(pipei, 5)))

  if (length(hits) == 1 || (length(hits) > 1 && hits_in_top5 == 1)) {
    # Classes A and B: a single usable hit, follow its link.
    subscript <- hits[1]

    # NOTE(review): titles are collected with ".../pl2//a" but links here with
    # ".../pl2/a"; if a result block contains nested anchors the two index
    # spaces could differ -- confirm against the live page structure.
    url_douban <- xpathSApply(
      doc, "//div[@class='pl2']/a", xmlGetAttr, "href"
    )[subscript]
    url_douban_id[i] <- extract_douban_id(url_douban)
    doc1 <- fetch_page(url_douban)

    # The class-B branch used to evaluate `imdb_id[i] < ...` (a comparison),
    # so the id was never stored; both classes now assign it.
    imdb_id[i] <- extract_imdb_id(doc1)
    class[i] <- if (length(hits) == 1) "A" else "B"
  } else if (length(hits) > 1) {
    # Several candidates: disambiguate by director and cast similarity.
    url_pipei <- xpathSApply(
      doc, "//div[@class='pl2']//a", xmlGetAttr, "href"
    )[hits]

    imdb_id_temp <- character(length(url_pipei))
    xishu <- numeric(length(url_pipei))
    for (n in seq_along(url_pipei)) {
      doc1 <- fetch_page(url_pipei[n])

      # Director: first <span> of the info box.
      daoyan <- clean_credit(
        xpathSApply(doc1, path = "//div[@id='info']//span", xmlValue),
        "导演:", " "
      )
      # Leading cast: <span class="actor"> of the info box.
      zhuyan <- clean_credit(
        xpathSApply(doc1, path = "//div[@id='info']//span[@class='actor']",
                    xmlValue),
        "主演:", "abcdef"
      )

      imdb_id_temp[n] <- extract_imdb_id(doc1)

      # Weighted similarity: 60% director (text[i, 2]), 40% cast (text[i, 3]).
      xishu[n] <- 0.6 * Fun(text[i, 2], daoyan) + 0.4 * Fun(text[i, 3], zhuyan)
    }

    # which.max() returns integer(0) when every score is NA; treat that the
    # same as "no candidate is good enough".
    subscript <- which.max(xishu)
    if (length(subscript) == 1 && xishu[subscript] > 0.5) {
      url_douban_id[i] <- extract_douban_id(url_pipei[subscript])
      imdb_id[i] <- imdb_id_temp[subscript]
      class[i] <- "D"
    } else {
      url_douban_id[i] <- 0
      imdb_id[i] <- 0
      class[i] <- "F"
    }
  } else if (length(pipei) == 1) {
    # Exactly one search result whose title did not match: check whether the
    # detail page lists the input title among its alternative names.
    url_douban <- xpathSApply(
      doc, "//div[@class='pl2']/a", xmlGetAttr, "href"
    )[1]
    doc1 <- fetch_page(url_douban)

    info_text <- xpathSApply(doc1, path = "//div[@id='info']", xmlValue)
    text_another_name <- "aaaaaa"  # placeholder used when no names are found
    if (length(info_text) != 0) {
      aka <- str_extract(info_text[[1]], pattern = "又名:.*IMDb链接")
      if (!is.na(aka)) {
        text_another_name <- aka
      }
    }

    if (str_detect(text_another_name, text[i, 1])) {
      url_douban_id[i] <- extract_douban_id(url_douban)
      imdb_id[i] <- extract_imdb_id(doc1)
      class[i] <- "C"
    } else {
      url_douban_id[i] <- NA
      imdb_id[i] <- NA
      class[i] <- "E"
    }
  } else {
    url_douban_id[i] <- NA
    imdb_id[i] <- NA
    class[i] <- "E"
  }

  # Pause 2-3 seconds between titles.
  Sys.sleep(2 + runif(1, 0, 1))
}

# Collect the results into a data frame with one row per input title:
# the title column text[,1], the Douban id, the IMDb id and the match class.
# The first argument is unnamed, so its column name is auto-generated by
# data.frame() from the expression `text[,1]`.
# NOTE(review): text[,1] was modified earlier in the script to escape
# parentheses for regex use, so titles containing "(" or ")" appear here with
# backslashes -- confirm whether the raw titles are wanted instead.
x <- data.frame(text[,1],url_douban_id,imdb_id,class)


#如果匹配列表返回值前五个出现相同的匹配结果，则返回id=0;考虑将结果范围缩小到
#如果列表返回值是1，但是不匹配名称，则获得链接，进入详情信息页面
#对搜索列表的电影名称进行精简修改，注意英文名字需要加上分隔符，比如哈利波特、珀西杰克逊等
#标题第二个字段好像是没有进行匹配的，需要进行修改

# Capture and print the end time of the run.
timeend <- Sys.time()
print(timeend)
# Elapsed time since `timestart` as a difftime, then print it.
runningtime <- difftime(timeend, timestart)
print(runningtime)

最后编辑于：2017.12.08 04:36:12

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 160,444评论 4赞 365
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 67,867评论 1赞 298
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 110,157评论 0赞 248
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 44,312评论 0赞 214
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 52,673评论 3赞 289
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 40,802评论 1赞 223
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 32,010评论 2赞 315
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 30,743评论 0赞 204
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 34,470评论 1赞 246
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 30,696评论 2赞 250
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 32,187评论 1赞 262
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 28,538评论 3赞 258
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 33,188评论 3赞 240
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 26,127评论 0赞 8
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 26,902评论 0赞 198
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 35,889评论 2赞 283
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 35,741评论 2赞 274

RCurl包爬取豆瓣电影id和IMDB电影号id

爬取豆瓣id和IMDB_id

推荐阅读更多精彩内容